vaexio · maartenbreddels · Jul 10, 2023 · Feb 14, 2023 · Feb 14, 2023 · Jul 7, 2023
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -30,7 +30,7 @@ jobs:
       fail-fast: false
       max-parallel: 12
       matrix:
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        os: [ubuntu-latest, windows-latest, macOS-11]
         python-version: [3.6, 3.7, 3.8, 3.9]
 
     steps:
@@ -40,7 +40,7 @@ jobs:
         df -h
     - uses: maxim-lobanov/setup-xcode@v1
       # alternative would be to upgrade tapi to 1100.0.11, we can possibly remove this in the future
-      if: matrix.os == 'macOS-latest'
+      if: matrix.os == 'macOS-11'
       with:
         xcode-version: "11.7.0"
     - uses: actions/checkout@v2
@@ -79,12 +79,12 @@ jobs:
     #     docker rmi $(docker image ls -aq)
     #     df -h
     - name: Install OpenMP runtime (Mac-only)
-      if: matrix.os == 'macOS-latest'
+      if: matrix.os == 'macOS-11'
       run: |
         brew install libomp
     - name: Cache compiled binaries
       # this fails for this combination, leading to binaries filled with 0's
-      if: matrix.python-version != '3.6' || matrix.os != 'macOS-latest'
+      if: matrix.python-version != '3.6' || matrix.os != 'macOS-11'
       id: cache-compiled-binaries
       uses: actions/cache@v2
       with:
@@ -169,15 +169,15 @@ jobs:
       run: |
         ./ci/05-run-notebooks.sh
     - name: Authenticate Google Cloud Platform
-      if: ${{ (github.event.pull_request.head.repo.full_name == 'vaexio/vaex') && !((matrix.os == 'windows-latest') || (matrix.os == 'macOS-latest' && matrix.python-version == '3.6'))  }}
+      if: ${{ (github.event.pull_request.head.repo.full_name == 'vaexio/vaex') && !((matrix.os == 'windows-latest') || (matrix.os == 'macOS-11' && matrix.python-version == '3.6'))  }}
       uses: google-github-actions/setup-gcloud@v0
       with:
         project_id: ${{ secrets.GCP_PROJECT_ID_VAEX }}
         service_account_key: ${{ secrets.GCP_SA_KEY_VAEX }}
         export_default_credentials: true
     - name: Test vaex-contrib
       # do not run in a PR from someone else, skip windows, and osx+py36
-      if: ${{ (github.event.pull_request.head.repo.full_name == 'vaexio/vaex') && !((matrix.os == 'windows-latest') || (matrix.os == 'macOS-latest' && matrix.python-version == '3.6'))  }}
+      if: ${{ (github.event.pull_request.head.repo.full_name == 'vaexio/vaex') && !((matrix.os == 'windows-latest') || (matrix.os == 'macOS-11' && matrix.python-version == '3.6'))  }}
       env:
         PROJECT_ID: ${{ secrets.GCP_PROJECT_ID_VAEX }}
       run: |
@@ -191,7 +191,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        os: [ubuntu-latest, windows-latest, macOS-11]
         python-version: [3.7, 3.8, 3.9]
         # ssl/certifi issues with this combination
         exclude:

diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-18.04, macOS-latest, windows-latest]
+        os: [ubuntu-18.04, macOS-11, windows-latest]
         python-version: [36, 37, 38, 39, 310]
 
     steps:

diff --git a/ci/conda-env.yml b/ci/conda-env.yml
@@ -11,10 +11,12 @@ dependencies:
 - cachetools
 - catboost
 - diskcache
+- filelock
 - fsspec<2022.2.0
 - gcsfs
 - geopandas
 - graphviz
+- joblib<1.3  # 1.3 is broken for py36
 - h5py
 - httpx # for testing with starlette/fastapi
 - ipyvolume=0.6.0a6
@@ -37,12 +39,18 @@ dependencies:
 - pytest-timeout
 - python-graphviz
 - py-xgboost
+- pyyaml
 - rich
 - s3fs>=2021.8.0
 - scikit-learn
 - scipy
+- tabulate
 - tornado
 - uvicorn<0.16
 - xarray<2022.6.0
 # currently not using this, since the test that requires this is flakey
 # - myst-parser<0.18  # 0.18 breaks our test, missing main
+- pytz  # for some reason, pip has trouble resolving this, and the libraries below
+- python-utils
+- progressbar2
+- zipp<3.16.0
diff --git a/packages/vaex-core/vaex/config.py b/packages/vaex-core/vaex/config.py
@@ -1,16 +1,7 @@
 import os
-from pydantic import BaseSettings
 import vaex.utils
+import pydantic
 
 
-# in seperate module to avoid circular imports
-
 class ConfigDefault:
-    env_file = ".env"
-    @classmethod
-    def customise_sources(cls, init_settings, env_settings, file_secret_settings):
-        return (
-            env_settings,
-            file_secret_settings,
-            init_settings,  # constructor argument last, since they come from global yaml file
-        )
+    pass
diff --git a/packages/vaex-core/vaex/minisettings.py b/packages/vaex-core/vaex/minisettings.py
@@ -0,0 +1,130 @@
+import os
+from pathlib import Path
+from typing import Any, Optional
+
+# Similar API to pydantic/pydantic-settings, but we prefer not to depend on pydantic,
+# since we cannot be compatible with both pydantic 1 and pydantic 2 at the same time.
+# NOTE: not a public API
+
+
+def _get_type(annotation):
+    """Unwrap ``Optional[T]`` for the basic types and reduce dict-like generics to ``dict``."""
+    for plain_type in (str, int, float, bool, dict, list):
+        if annotation == Optional[plain_type]:
+            return plain_type
+    # anything whose ``__origin__`` compares equal to ``dict`` is treated as a plain dict
+    if getattr(annotation, "__origin__", None) == dict:
+        return dict
+    return annotation
+
+
+class _Field:
+    """Descriptor describing one setting: default, env var name, title, alias and annotation."""
+
+    def __init__(self, default=None, env=None, title=None, default_factory=None, gt=None, alias=None) -> None:
+        self.default = default
+        self.env = env
+        self.fullenv = None
+        self.title = title
+        self.annotation = None
+        self.default_factory = default_factory
+        self.gt = gt  # stored only; not validated anywhere in this module
+        self.alias = alias
+        self.field_info = self  # presumably mirrors pydantic's `field.field_info` - confirm against callers
+        self.extra = {"env_names": [env] if env else []}
+
+    def __set_name__(self, owner, name):
+        """Bind the field to `owner`: resolve alias, title, env var name and type annotation."""
+        prefix = "VAEX_"
+        # the nested Config class is optional, so do not raise when the owner has none
+        config = getattr(owner, "Config", None)
+        if config:
+            prefix = getattr(config, "env_prefix", prefix).upper()
+            # Config.fields maps a field name to its alias
+            self.alias = getattr(config, "fields", {}).get(name, self.alias)
+        self.name = name
+        self.alias = self.alias or self.name
+        self.title = self.title or self.name
+        if self.env is None:
+            self.env = f"{prefix}{self.name.upper()}"
+        self.annotation = owner.__annotations__.get(self.name)
+        assert self.annotation is not None, f"Field {self.name} must have a type annotation"
+        self.type_ = _get_type(self.annotation)
+
+    def __get__(self, instance, owner):
+        if instance is None:
+            return self  # class-level access returns the field itself
+        return instance._values[self.name]
+
+
+def convert(annotation, value: str) -> Any:
+    """Turn the string `value` (e.g. an environment variable) into the type described by `annotation`.
+
+    `Optional[T]` is handled like `T` for str, int, float, bool and Path; any other annotation is
+    called with the string. Raises ValueError when a bool is requested and the spelling is unknown.
+    """
+    for inner in (str, int, float, bool, Path):
+        if annotation == Optional[inner]:
+            return convert(inner, value)
+    if annotation == str:
+        return value
+    if annotation in (int, float):
+        return annotation(value)
+    if annotation == bool:
+        # only these exact spellings are accepted
+        if value in ("True", "true", "1"):
+            return True
+        if value in ("False", "false", "0"):
+            return False
+        raise ValueError(f"Invalid boolean value {value}")
+    # anything else (e.g. Path) is constructed directly from the string
+    return annotation(value)
+
+
+def Field(*args, **kwargs) -> Any:
+    return _Field(*args, **kwargs)  # builds a _Field; annotated as Any, presumably so `x: int = Field(...)` type-checks
+
+
+class BaseSettings:
+    """Settings container: values come from keyword arguments, else environment variables, else defaults."""
+
+    __fields__: dict
+
+    def __init__(self, **kwargs) -> None:
+        """Store `kwargs` as-is and fill every field not passed in from its env var or default."""
+        self._values = {**kwargs}
+        # environment variable names are matched case-insensitively; build the lookup once
+        # instead of rescanning os.environ for every field (on a clash the last variable wins)
+        environ = {name.upper(): raw for name, raw in os.environ.items()}
+        for key, field in type(self).__dict__.items():
+            if key in kwargs or not isinstance(field, _Field):
+                continue
+            value = field.default_factory() if field.default_factory else field.default
+            if field.env and field.env.upper() in environ:
+                value = convert(field.annotation, environ[field.env.upper()])
+            self._values[key] = value
+
+    def __init_subclass__(cls) -> None:
+        """Wrap every public class attribute (except `Config`) in a field and index it in `__fields__`."""
+        cls.__fields__ = {}
+        for key, field in cls.__dict__.items():
+            if key.startswith("_") or key == "Config":
+                continue
+            if not isinstance(field, _Field):
+                # plain default value: promote it to a field and bind it by hand, since
+                # __set_name__ is not called for attributes assigned after class creation
+                field = Field(field)
+                setattr(cls, key, field)
+                field.__set_name__(cls, key)
+            cls.__fields__[key] = field
+
+    def dict(self, by_alias=True):
+        """Return a copy of the values with nested settings objects converted to dicts.
+
+        NOTE(review): `by_alias` is only forwarded to nested settings; keys are always field names.
+        """
+        values = self._values.copy()
+        for key, value in values.items():
+            if isinstance(value, BaseSettings):
+                values[key] = value.dict(by_alias=by_alias)
+        return values
diff --git a/packages/vaex-core/vaex/settings.py b/packages/vaex-core/vaex/settings.py
@@ -7,22 +7,27 @@
 import multiprocessing
 import sys
 
-from pydantic import BaseModel, BaseSettings, Field
-from typing import List, Union, Optional, Dict
+from pydantic import BaseModel, Field
+import pydantic
+from typing import Any, List, Union, Optional, Dict
 from enum import Enum
-from .config import ConfigDefault
 
-logger = logging.getLogger("vaex.settings")
-_default_home = vaex.utils.get_vaex_home()
 try:
     import dotenv
     has_dotenv = True
 except:
     has_dotenv = False
-if has_dotenv:
-    from pydantic.env_settings import read_env_file
-    envs = read_env_file(ConfigDefault.env_file)
-    _default_home = envs.get('vaex_home', _default_home)
+
+
+logger = logging.getLogger("vaex.settings")
+_default_home = vaex.utils.get_vaex_home()
+
+
+class ConfigDefault:
+    pass
+
+
+from .minisettings import BaseSettings, Field
 
 # we may want to use this
 # class ByteAmount(str):
@@ -77,7 +82,7 @@ class Config(ConfigDefault):
 
 class Chunk(BaseSettings):
     """Configure how a dataset is broken down in smaller chunks. The executor dynamically adjusts the chunk size based on `size_min` and `size_max` and the number of threads when `size` is not set."""
-    size: Optional[int] = Field(title="When set, fixes the number of chunks, e.g. do not dynamically adjust between min and max")
+    size: Optional[int] = Field(None, title="When set, fixes the number of chunks, e.g. do not dynamically adjust between min and max")
     size_min: int = Field(1024, title="Minimum chunk size")
     size_max: int = Field(1024**2, title="Maximum chunk size")
     class Config(ConfigDefault):
@@ -126,7 +131,7 @@ class Config(ConfigDefault):
 class Progress(BaseSettings):
     """Data configuration"""
     type: str = Field('simple', title="Default progressbar to show: 'simple', 'rich' or 'widget'")
-    force: str = Field(None, title="Force showing a progress bar of this type, even when no progress bar was requested from user code", env="VAEX_PROGRESS")
+    force: Optional[str] = Field(None, title="Force showing a progress bar of this type, even when no progress bar was requested from user code", env="VAEX_PROGRESS")
 
     class Config(ConfigDefault):
         env_prefix = 'vaex_progress_'
@@ -152,27 +157,27 @@ class Config(ConfigDefault):
 class Settings(BaseSettings):
     """General settings for vaex"""
     aliases: Optional[dict] = Field(title='Aliases to be used for vaex.open', default_factory=dict)
-    async_: AsyncEnum = Field('nest', env='VAEX_ASYNC', title="How to run async code in the local executor", min_length=2)
+    async_: AsyncEnum = Field('nest', env='VAEX_ASYNC', title="How to run async code in the local executor")
     home: str = Field(_default_home, env="VAEX_HOME", title="Home directory for vaex, which defaults to `$HOME/.vaex`, "\
         " If both `$VAEX_HOME` and `$HOME` are not defined, the current working directory is used. (Note that this setting cannot be configured from the vaex home directory itself).")
     mmap: bool = Field(True, title="Experimental to turn off, will avoid using memory mapping if set to False")
-    process_count: Optional[int] = Field(title="Number of processes to use for multiprocessing (e.g. apply), defaults to thread_count setting", gt=0)
-    thread_count: Optional[int] = Field(env='VAEX_NUM_THREADS', title="Number of threads to use for computations, defaults to multiprocessing.cpu_count()", gt=0)
-    thread_count_io: Optional[int] = Field(env='VAEX_NUM_THREADS_IO', title="Number of threads to use for IO, defaults to thread_count_io + 1", gt=0)
+    process_count: Optional[int] = Field(None, title="Number of processes to use for multiprocessing (e.g. apply), defaults to thread_count setting", gt=0)
+    thread_count: Optional[int] = Field(None, env='VAEX_NUM_THREADS', title="Number of threads to use for computations, defaults to multiprocessing.cpu_count()", gt=0)
+    thread_count_io: Optional[int] = Field(None, env='VAEX_NUM_THREADS_IO', title="Number of threads to use for IO, defaults to thread_count_io + 1", gt=0)
     path_lock: str = Field(os.path.join(_default_home, "lock"), env="VAEX_LOCK", title="Directory to store lock files for vaex, which defaults to `${VAEX_HOME}/lock/`, "\
         " Due to possible race conditions lock files cannot be removed while processes using Vaex are running (on Unix systems).")
 
 
     # avoid name collisions of VAEX_CACHE with configurting the whole object via json in env var
-    cache = Field(Cache(), env='_VAEX_CACHE')
+    cache: Cache = Field(Cache(), env='_VAEX_CACHE')
     chunk: Chunk = Field(Chunk(), env='_VAEX_CHUNK')
-    data = Field(Data(), env='_VAEX_DATA')
+    data: Data = Field(Data(), env='_VAEX_DATA')
     display: Display = Field(Display(), env='_VAEX_DISPLAY')
     fs: FileSystem = Field(FileSystem(), env='_VAEX_FS')
-    memory_tracker = Field(MemoryTracker(), env='_VAEX_MEMORY_TRACKER')
-    task_tracker = Field(TaskTracker(), env='_VAEX_TASK_TRACKER')
-    logging = Field(Logging(), env="_VAEX_LOGGING")
-    progress = Field(Progress(), env="_VAEX_PROGRESS")
+    memory_tracker: MemoryTracker = Field(MemoryTracker(), env='_VAEX_MEMORY_TRACKER')
+    task_tracker: TaskTracker = Field(TaskTracker(), env='_VAEX_TASK_TRACKER')
+    logging: Logging = Field(Logging(), env="_VAEX_LOGGING")
+    progress: Progress = Field(Progress(), env="_VAEX_PROGRESS")
 
     if has_server:
         server: vaex.server.settings.Settings = vaex.server.settings.Settings()

diff --git a/packages/vaex-jupyter/vaex/jupyter/widgets.py b/packages/vaex-jupyter/vaex/jupyter/widgets.py
@@ -1,13 +1,17 @@
 from __future__ import absolute_import
+
+import os
+
 import ipyvuetify as v
 import ipywidgets as widgets
 import traitlets
 from traitlets import *  # noqa
-from . import traitlets as vt
-import os
+from traitlets import Dict, observe
 
 import vaex.jupyter
 
+from . import traitlets as vt
+
 
 def load_template(filename):
     with open(os.path.join(os.path.dirname(__file__), filename)) as f:
@@ -558,6 +562,7 @@ def _template(self):
 import ipyvuetify as v
 import traitlets
 
+
 class SettingsEditor(v.VuetifyTemplate):
     template_file = os.path.join(os.path.dirname(__file__), "vue/vjsf.vue")
 

diff --git a/packages/vaex-ml/vaex/ml/transformations.py b/packages/vaex-ml/vaex/ml/transformations.py
@@ -243,7 +243,7 @@ def _valid_eps(self, proposal):
 
     @traitlets.validate('density')
     def _valid_density(self, proposal):
-        if (proposal['value'] > 0) & (proposal['value'] <= 1):
+        if proposal['value'] is None or (proposal['value'] > 0) & (proposal['value'] <= 1):
             return proposal['value']
         else:
             raise traitlets.TraitError('`density` must be 0 < density <= 1.')