FEAT: DataFrame Where OP Support #756

Closed
wants to merge 21 commits into from
18 changes: 3 additions & 15 deletions .github/workflows/build-wheel.yaml
@@ -21,26 +21,14 @@ jobs:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [auto]
requires-python: [">=3.8,<3.10", ">=3.10,<3.12"]
requires-python: [">=3.9,<3.12"]
include:
- os: ubuntu-latest
arch: aarch64
requires-python: ">=3.8,<3.9"
- os: ubuntu-latest
arch: aarch64
requires-python: ">=3.9,<3.10"
- os: ubuntu-latest
arch: aarch64
requires-python: ">=3.10,<3.11"
- os: ubuntu-latest
arch: aarch64
requires-python: ">=3.11,<3.12"
- os: macos-latest
arch: universal2
requires-python: ">=3.8,<3.10"
requires-python: ">=3.9,<3.12"
- os: macos-latest
arch: universal2
requires-python: ">=3.10,<3.12"
requires-python: ">=3.9,<3.12"

steps:
- uses: actions/checkout@v3
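For context, the trimmed build matrix above now expands to fewer wheel-build jobs: three `os` × `auto` × one `requires-python` combinations, plus the two explicit `include` entries. A rough Python sketch of GitHub Actions' matrix expansion (the `jobs` list and dict shape are illustrative, not the runner's actual data structure):

```python
from itertools import product

# Base matrix dimensions, mirroring the new build-wheel.yaml above.
os_list = ["ubuntu-latest", "macos-latest", "windows-latest"]
arch_list = ["auto"]
requires_python = [">=3.9,<3.12"]

jobs = [
    {"os": o, "arch": a, "requires-python": rp}
    for o, a, rp in product(os_list, arch_list, requires_python)
]

# Explicit `include` entries add extra combinations on top of the product.
jobs.append({"os": "ubuntu-latest", "arch": "aarch64", "requires-python": ">=3.9,<3.12"})
jobs.append({"os": "macos-latest", "arch": "universal2", "requires-python": ">=3.9,<3.12"})

print(len(jobs))  # → 5
```

Under the old matrix, the aarch64 builds alone needed four per-minor-version entries; collapsing to a single `>=3.9,<3.12` range is what shrinks the job count.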
2 changes: 1 addition & 1 deletion .github/workflows/docker-cd.yaml
@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.9", "3.10", "3.11" ]
steps:
- name: Check out code
uses: actions/checkout@v3
17 changes: 11 additions & 6 deletions .github/workflows/python.yaml
@@ -40,14 +40,15 @@ jobs:
with:
src: "python/xorbits"
options: "--check"
version: "23.12.0"
- uses: isort/isort-action@master
with:
sortPaths: "python/xorbits"
configuration: "--check-only --diff --sp python/setup.cfg"
- name: mypy
run: pip install mypy && cd python && mypy xorbits
- name: codespell
run: pip install codespell && cd python && codespell xorbits
run: pip install codespell==2.2.6 && cd python && codespell xorbits
- name: Set up Node.js
uses: actions/setup-node@v1
with:
@@ -73,7 +74,7 @@ jobs:
fail-fast: false
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11"]
module: ["xorbits", "kubernetes"]
exclude:
- { os: macos-latest, python-version: 3.10}
@@ -110,7 +111,7 @@ jobs:
uses: microsoft/setup-msbuild@v1.1

- name: Set up conda ${{ matrix.python-version }}
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
if: ${{ matrix.module != 'gpu' && matrix.module != 'kubernetes-juicefs' }}
with:
python-version: ${{ matrix.python-version }}
@@ -122,20 +123,23 @@
with:
driver: none
kubernetes-version: v1.23.12
minikube-version: 1.31.2

- name: Install ucx dependencies
if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest') && (matrix.python-version != '3.11') }}
run: |
conda install -c conda-forge -c rapidsai ucx-proc=*=cpu ucx ucx-py

- name: Install libomp (macOS)
if: matrix.os == 'macos-latest'
run: brew install libomp
- name: Install dependencies
env:
MODULE: ${{ matrix.module }}
PYTHON: ${{ matrix.python-version }}
if: ${{ matrix.module != 'gpu' }}
run: |
pip install -e "git+https://github.com/xorbitsai/xoscar.git@main#subdirectory=python&egg=xoscar"
pip install numpy scipy cython pyftpdlib coverage flaky "numexpr<2.8.5"
pip install "numpy<2.0.0" scipy cython pyftpdlib coverage flaky "numexpr<2.8.5"

if [[ "$MODULE" == "xorbits" ]]; then
pip install openpyxl
@@ -228,7 +232,7 @@ jobs:
fi
if [[ "$MODULE" == "compatibility" ]]; then
# test if compatible with older versions
pip install pandas==1.5.3 pyarrow\<12.0.0 sqlalchemy\<2
pip install "pandas==1.5.3" "scipy<=1.10.1" "numpy<=1.24.1" "matplotlib<=3.7.0" "pyarrow<12.0.0" "sqlalchemy<2"
fi
if [[ "$MODULE" == "jax" ]]; then
# test jax
@@ -306,6 +310,7 @@ jobs:
elif [[ "$MODULE" == "hadoop" ]]; then
export WITH_HADOOP="1"
export HADOOP_HOME="/usr/local/hadoop"
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
11 changes: 3 additions & 8 deletions .pre-commit-config.yaml
@@ -1,7 +1,7 @@
files: python/xorbits
repos:
- repo: https://github.com/psf/black
rev: 23.7.0
rev: 23.12.0
hooks:
- id: black
- repo: https://github.com/pre-commit/pre-commit-hooks
@@ -10,7 +10,7 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/PyCQA/flake8
rev: 6.1.0
rev: 6.0.0
hooks:
- id: flake8
args: [--config, python/setup.cfg]
@@ -20,17 +20,12 @@ repos:
- id: isort
args: [--sp, python/setup.cfg]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
rev: v1.9.0
hooks:
- id: mypy
additional_dependencies: [tokenize-rt==3.2.0]
exclude: _mars
args: [--ignore-missing-imports, --follow-imports, skip]
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.0 # Use the sha or tag you want to point at
hooks:
- id: prettier
types_or: [html, javascript]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
hooks:
2 changes: 2 additions & 0 deletions CI/install-hadoop.sh
@@ -108,6 +108,7 @@ EOF
cat > /tmp/hadoop.sh << EOF
#!/bin/bash
export HADOOP_HOME=/usr/local/hadoop
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
EOF
sudo mv /tmp/hadoop.sh /etc/profile.d/
sudo chmod a+x /etc/profile.d/hadoop.sh
@@ -135,6 +136,7 @@ sudo ln -s "/usr/local/hadoop-$VERSION" /usr/local/hadoop
sudo chown -R hduser:hadoop "/usr/local/hadoop-$VERSION"

export HADOOP_HOME=/usr/local/hadoop
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`

# enable ssh login without password
sudo su - hduser -c "ssh-keygen -t rsa -P \"\" -f /home/hduser/.ssh/id_rsa"
10 changes: 4 additions & 6 deletions CI/requirements-wheel.txt
@@ -1,16 +1,14 @@
oldest-supported-numpy

pandas==1.0.4; python_version<'3.9' and platform_machine!='aarch64'
pandas==1.1.3; python_version<'3.9' and platform_machine=='aarch64'
pandas==1.2.2; python_version>='3.9' and python_version<'3.10'
pandas==1.3.4; python_version>='3.10' and python_version<'3.11'
pandas==1.5.0; python_version>='3.11'
pandas==1.5.1; python_version>='3.11' and python_version<'3.12'

numpy<2.0.0

scipy==1.4.1; python_version<'3.9' and platform_machine!='aarch64'
scipy==1.5.3; python_version<'3.9' and platform_machine=='aarch64'
scipy==1.5.4; python_version>='3.9' and python_version<'3.10'
scipy==1.7.2; python_version>='3.10' and python_version<'3.11'
scipy==1.10.0; python_version>='3.11'
scipy==1.10.0; python_version>='3.11' and python_version<'3.12'

# see: https://github.com/cython/cython/commit/afc00fc3ba5d43c67151c0039847a526e7b627a5
cython==0.29.33
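The requirements file above uses PEP 508 environment markers so that pip selects exactly one pandas/scipy pin per interpreter. A hedged Python sketch of that selection logic (the `PINS` table mirrors the file; `select_pin` is a hypothetical helper, not part of pip):

```python
import sys

# Mirrors the pandas pins in CI/requirements-wheel.txt after this change:
# (pin, lower-inclusive python version, upper-exclusive python version)
PINS = [
    ("pandas==1.2.2", (3, 9), (3, 10)),
    ("pandas==1.3.4", (3, 10), (3, 11)),
    ("pandas==1.5.1", (3, 11), (3, 12)),
]

def select_pin(py=None):
    """Return the single pin whose python_version marker matches."""
    py = py or sys.version_info[:2]
    for spec, lo, hi in PINS:
        if lo <= py < hi:
            return spec
    return None  # no pin applies (e.g. Python >= 3.12)

print(select_pin((3, 10)))  # → pandas==1.3.4
```

Because the ranges are contiguous and non-overlapping, every supported interpreter (3.9–3.11) matches exactly one line, which is why dropping the 3.8 rows was a clean simplification.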
17 changes: 17 additions & 0 deletions README.md
@@ -70,6 +70,7 @@ pip install xorbits
* [Examples and Tutorials](https://doc.xorbits.io/en/latest/getting_started/examples.html)
* [Performance Benchmarks](https://xorbits.io/benchmark)
* [Development Guide](https://doc.xorbits.io/en/latest/development/index.html)
* [Research Paper on Xorbits' Internals](https://arxiv.org/abs/2401.00865)

## License
[Apache 2](LICENSE)
@@ -101,3 +102,19 @@ with other upcoming ones we will propose. Stay tuned!
| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. |
| [StackOverflow](https://stackoverflow.com/questions/tagged/xorbits) | Asking questions about how to use Xorbits. |
| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. |

## Citing Xorbits

If Xorbits helps your work, please cite our paper, accepted by the ICDE 2024 Industry and Applications Track:

```
@article{lu2023xorbits,
title={Xorbits: Automating Operator Tiling for Distributed Data Science},
author={Weizheng Lu and Kaisheng He and Xuye Qin and Chengjie Li and Zhong Wang and Tao Yuan and Feng Zhang and Yueguo Chen and Xiaoyong Du},
year={2023},
archivePrefix={arXiv},
url={https://doi.org/10.48550/arXiv.2401.00865},
eprinttype={arXiv},
eprint={2401.00865},
}
```
95 changes: 60 additions & 35 deletions doc/source/libraries/xorbits_train/lightgbm.rst
@@ -1,8 +1,8 @@
.. _10min_lightgbm:

===================================
======================================
10 minutes to :code:`xorbits.lightgbm`
===================================
======================================

.. currentmodule:: xorbits.lightgbm

@@ -12,88 +12,113 @@ Let's take :code:`LGBMRegressor` as an example and explain how to build a regression model.

Customarily, we import and init as follows:

.. ipython:: python
::

>>> import xorbits
>>> import xorbits.numpy as np
>>> from xorbits.lightgbm import LGBMRegressor
>>> from xorbits.sklearn.model_selection import train_test_split
>>> xorbits.init()

import xorbits
import xorbits.numpy as np
from xorbits.lightgbm import LGBMRegressor
from xorbits.sklearn.model_selection import train_test_split
xorbits.init()

Model Creation
---------------
--------------
First, we build a :code:`LGBMRegressor` model and define its parameters.

This model exposes many tunable hyperparameters, such as tree depth, the number of leaf nodes, and the learning rate, which you can adjust to optimize its performance.

.. ipython:: python
::

>>> lgbm_regressor = LGBMRegressor(learning_rate=0.05, n_estimators=100)

lgbm_regressor = LGBMRegressor(learning_rate=0.05,n_estimators=100)

The :code:`.get_params` method returns a dictionary mapping every parameter name of the model to its current value. You can inspect it to understand the model's current configuration.

Inspect the parameters of the LightGBM regressor.

.. ipython:: python
::

>>> paras=lgbm_regressor.get_params()
>>> paras
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0,
'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1,
'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0,
'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None,
'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0,
'subsample_for_bin': 200000, 'subsample_freq': 0}

paras=lgbm_regressor.get_params()
paras

Set/modify parameters.

The :code:`.set_params` method lets you modify a model's parameters in place by passing parameter names and their new values, without recreating the model object.

.. ipython:: python
::

>>> lgbm_regressor.set_params(learning_rate=0.1, n_estimators=100)
>>> lgbm_regressor.get_params()
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0,
'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1,
'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0,
'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None,
'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0,
'subsample_for_bin': 200000, 'subsample_freq': 0}

lgbm_regressor.set_params(learning_rate=0.1, n_estimators=100)
lgbm_regressor.get_params()

Data Preparation
---------------
----------------
We can use real data as input. For the sake of simplicity, we will use randomly generated x and y data as an example.

.. ipython:: python
::

>>> x = np.random.rand(100)
>>> y_regression = 2 * x + 1 + 0.1 * np.random.randn(100)
>>> x=x.reshape(-1, 1)

x = np.random.rand(100)
y_regression = 2 * x + 1 + 0.1 * np.random.randn(100)
x=x.reshape(-1, 1)

In order to train the model, we split the dataset into a training set and a test set.

.. ipython:: python
::

>>> X_train, X_test, y_train, y_test = train_test_split(x, y_regression, test_size=0.2)

X_train, X_test, y_train, y_test = train_test_split(x, y_regression, test_size=0.2)

Model Training
---------------
--------------
The :code:`.fit` method takes the training data (independent variable x and dependent variable y) and fits the model to the data.

The model adjusts its parameters to minimize the error between the predicted values and the actual observations.

.. ipython:: python
::

>>> lgbm_regressor.fit(X_train, y_train)

lgbm_regressor.fit(X_train, y_train)

Model Prediction
---------------
----------------

Once you have trained a model, you can use the :code:`.predict` method to apply that model to new data and generate predictions for the new data.

.. ipython:: python
::

>>> y_pred = lgbm_regressor.predict(X_test)
>>> y_pred
array([1.81201326, 2.64419095, 1.56956112, 1.56956112, 1.22159593,
2.64419095, 2.64419095, 2.11885373, 1.81201326, 1.22159593,
2.64419095, 1.22159593, 1.56956112, 2.64419095, 2.64419095,
2.64419095, 1.90073406, 2.64419095, 1.81201326, 1.81201326])

y_pred = lgbm_regressor.predict(X_test)
y_pred

Model Evaluation
---------------
----------------

:code:`.score` is typically used to assess the performance of a machine learning model.

In regression problems, the :code:`.score` method usually returns the coefficient of determination (R-squared) score, which represents the model's ability to explain the variability in the dependent variable.

Calculate the model's estimated accuracy on the test set.

.. ipython:: python
::

accuracy = lgbm_regressor.score(X_test, y_test)
accuracy
>>> accuracy = lgbm_regressor.score(X_test, y_test)
>>> accuracy
0.9323625100248668
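The diff above covers CI and documentation updates; the PR title refers to DataFrame `where` support. As a hedged sketch of the pandas-style `where` semantics that operation follows — keep each element where the condition holds, otherwise substitute `other` — here is a simplified single-column illustration (`where_column` is a hypothetical helper, not the actual xorbits implementation):

```python
def where_column(values, cond, other):
    """Element-wise where: keep values[i] if cond[i] is True, else `other`.

    Simplified sketch of pandas-style DataFrame.where semantics for one
    column with a scalar replacement value.
    """
    return [v if c else other for v, c in zip(values, cond)]

col = [1, -2, 3, -4]
mask = [v > 0 for v in col]
print(where_column(col, mask, 0))  # → [1, 0, 3, 0]
```

In pandas itself, `df.where(cond, other)` applies this element-wise across the whole frame, with `other` defaulting to NaN; a distributed implementation additionally has to tile the condition and replacement operands alongside the data.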