Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/pull_request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ jobs:
python -m pip install --upgrade pip
pip install -e ".[test]"

- name: Run coding checks
run: |
make lint

- name: Test with pytest
run: |
make unittest
49 changes: 0 additions & 49 deletions .ruff.toml

This file was deleted.

8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
# Run the unit-test suite: a single small dataset-download test, with the repo root on PYTHONPATH.
unittest:
PYTHONPATH=`pwd` python3 -m pytest tests/test_dataset.py::TestDataSet::test_download_small -svv

# Rewrite sources in place: black formats first, then ruff applies its auto-fixes.
format:
PYTHONPATH=`pwd` python3 -m black vectordb_bench
PYTHONPATH=`pwd` python3 -m ruff check vectordb_bench --fix

# Check-only counterpart of `format`: exits non-zero if any file would change (run by the CI workflow).
lint:
PYTHONPATH=`pwd` python3 -m black vectordb_bench --check
PYTHONPATH=`pwd` python3 -m ruff check vectordb_bench
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,13 +240,13 @@ After reopen the repository in container, run `python -m vectordb_bench` in the

### Check coding styles
```shell
$ ruff check vectordb_bench
$ make lint
```

Add `--fix` if you want to fix the coding styles automatically
To fix the coding styles automatically

```shell
$ ruff check vectordb_bench --fix
$ make format
```

## How does it work?
Expand Down
114 changes: 114 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ dynamic = ["version"]

[project.optional-dependencies]
test = [
"black",
"ruff",
"pytest",
]
Expand Down Expand Up @@ -93,3 +94,116 @@ init_bench = "vectordb_bench.__main__:main"
vectordbbench = "vectordb_bench.cli.vectordbbench:cli"

[tool.setuptools_scm]

# Black formatter configuration; its line length is mirrored by ruff's `line-length` below.
[tool.black]
line-length = 120
target-version = ['py311']
include = '\.pyi?$'

# Ruff linter configuration (replaces the deleted standalone .ruff.toml).
[tool.ruff]
lint.select = [
"E",
"F",
"C90",
"I",
"N",
"B", "C", "G",
"A",
"ANN001",
"S", "T", "W", "ARG", "BLE", "COM", "DJ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"
]
lint.ignore = [
"BLE001", # blind-except (BLE001)
"SLF001", # SLF001 Private member accessed [E]
"TRY003", # [ruff] TRY003 Avoid specifying long messages outside the exception class [E]
"FBT001", "FBT002", "FBT003",
"G004", # [ruff] G004 Logging statement uses f-string [E]
"UP031",
"RUF012",
"EM101",
"N805",
"ARG002",
"ARG003",
"PIE796", # https://github.com/zilliztech/VectorDBBench/issues/438
"INP001", # TODO
"TID252", # TODO
"N801", "N802", "N815",
"S101", "S108", "S603", "S311",
"PLR2004",
"RUF017",
"C416",
"PLW0603",
]

# Allow autofix for all enabled rules (when `--fix`) is provided.
lint.fixable = [
"A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W",
"ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT",
"ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH",
"PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP",
"YTT",
]
lint.unfixable = []

# Show an enumeration of applied fixes in ruff's output.
show-fixes = true

# Exclude a variety of commonly ignored directories.
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".mypy_cache",
".nox",
".pants.d",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"venv",
"grpc_gen",
"__pycache__",
"frontend", # TODO
"tests",
]

# Same as Black.
line-length = 120

# Allow unused variables when underscore-prefixed.
lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Assume Python 3.11
target-version = "py311"

[tool.ruff.lint.mccabe]
# Permit functions up to McCabe complexity 18 (ruff's own default threshold is 10).
max-complexity = 18

[tool.ruff.lint.pycodestyle]
max-line-length = 120
max-doc-length = 120

# Relaxed pylint-style limits on function argument and branch counts for this codebase.
[tool.ruff.lint.pylint]
max-args = 20
max-branches = 15

# Builtin-shadowing (A) rule exceptions — none enabled currently; candidates kept commented for reference.
[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = [
# "format",
# "next",
# "object", # TODO
# "id",
# "dict", # TODO
# "filter",
]

73 changes: 49 additions & 24 deletions vectordb_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,46 +22,71 @@ class config:
DROP_OLD = env.bool("DROP_OLD", True)
USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)

NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], subcast=int )
NUM_CONCURRENCY = env.list(
"NUM_CONCURRENCY",
[
1,
5,
10,
15,
20,
25,
30,
35,
40,
45,
50,
55,
60,
65,
70,
75,
80,
85,
90,
95,
100,
],
subcast=int,
)

CONCURRENCY_DURATION = 30

RESULTS_LOCAL_DIR = env.path(
"RESULTS_LOCAL_DIR", pathlib.Path(__file__).parent.joinpath("results")
"RESULTS_LOCAL_DIR",
pathlib.Path(__file__).parent.joinpath("results"),
)
CONFIG_LOCAL_DIR = env.path(
"CONFIG_LOCAL_DIR", pathlib.Path(__file__).parent.joinpath("config-files")
"CONFIG_LOCAL_DIR",
pathlib.Path(__file__).parent.joinpath("config-files"),
)


K_DEFAULT = 100 # default return top k nearest neighbors during search
CUSTOM_CONFIG_DIR = pathlib.Path(__file__).parent.joinpath("custom/custom_case.json")

CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
LOAD_TIMEOUT_DEFAULT = 24 * 3600 # 24h
LOAD_TIMEOUT_768D_1M = 24 * 3600 # 24h
LOAD_TIMEOUT_768D_10M = 240 * 3600 # 10d
LOAD_TIMEOUT_768D_100M = 2400 * 3600 # 100d
CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
LOAD_TIMEOUT_DEFAULT = 24 * 3600 # 24h
LOAD_TIMEOUT_768D_1M = 24 * 3600 # 24h
LOAD_TIMEOUT_768D_10M = 240 * 3600 # 10d
LOAD_TIMEOUT_768D_100M = 2400 * 3600 # 100d

LOAD_TIMEOUT_1536D_500K = 24 * 3600 # 24h
LOAD_TIMEOUT_1536D_5M = 240 * 3600 # 10d
LOAD_TIMEOUT_1536D_500K = 24 * 3600 # 24h
LOAD_TIMEOUT_1536D_5M = 240 * 3600 # 10d

OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_768D_10M = 240 * 3600 # 10d
OPTIMIZE_TIMEOUT_768D_100M = 2400 * 3600 # 100d
OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_768D_10M = 240 * 3600 # 10d
OPTIMIZE_TIMEOUT_768D_100M = 2400 * 3600 # 100d

OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600 # 10d

OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600 # 10d

def display(self) -> str:
tmp = [
i for i in inspect.getmembers(self)
if not inspect.ismethod(i[1])
and not i[0].startswith('_')
and "TIMEOUT" not in i[0]
return [
i
for i in inspect.getmembers(self)
if not inspect.ismethod(i[1]) and not i[0].startswith("_") and "TIMEOUT" not in i[0]
]
return tmp


log_util.init(config.LOG_LEVEL)
7 changes: 4 additions & 3 deletions vectordb_bench/__main__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import traceback
import logging
import pathlib
import subprocess
import os
import traceback

from . import config

log = logging.getLogger("vectordb_bench")
Expand All @@ -16,7 +17,7 @@ def run_streamlit():
cmd = [
"streamlit",
"run",
f"{os.path.dirname(__file__)}/frontend/vdb_benchmark.py",
f"{pathlib.Path(__file__).parent}/frontend/vdb_benchmark.py",
"--logger.level",
"info",
"--theme.base",
Expand Down
25 changes: 12 additions & 13 deletions vectordb_bench/backend/assembler.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,32 @@
from .cases import CaseLabel
from .task_runner import CaseRunner, RunningStatus, TaskRunner
from ..models import TaskConfig
from ..backend.clients import EmptyDBCaseConfig
from ..backend.data_source import DatasetSource
import logging

from vectordb_bench.backend.clients import EmptyDBCaseConfig
from vectordb_bench.backend.data_source import DatasetSource
from vectordb_bench.models import TaskConfig

from .cases import CaseLabel
from .task_runner import CaseRunner, RunningStatus, TaskRunner

log = logging.getLogger(__name__)


class Assembler:
@classmethod
def assemble(cls, run_id , task: TaskConfig, source: DatasetSource) -> CaseRunner:
def assemble(cls, run_id: str, task: TaskConfig, source: DatasetSource) -> CaseRunner:
c_cls = task.case_config.case_id.case_cls

c = c_cls(task.case_config.custom_case)
if type(task.db_case_config) != EmptyDBCaseConfig:
if type(task.db_case_config) is not EmptyDBCaseConfig:
task.db_case_config.metric_type = c.dataset.data.metric_type

runner = CaseRunner(
return CaseRunner(
run_id=run_id,
config=task,
ca=c,
status=RunningStatus.PENDING,
dataset_source=source,
)

return runner

@classmethod
def assemble_all(
cls,
Expand All @@ -50,12 +49,12 @@ def assemble_all(
db2runner[db].append(r)

# check dbclient installed
for k in db2runner.keys():
for k in db2runner:
_ = k.init_cls

# sort by dataset size
for k in db2runner.keys():
db2runner[k].sort(key=lambda x:x.ca.dataset.data.size)
for k, _ in db2runner:
db2runner[k].sort(key=lambda x: x.ca.dataset.data.size)

all_runners = []
all_runners.extend(load_runners)
Expand Down
Loading
Loading