From 685e0b2de971263a60a55727c90f408da60e009d Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Tue, 30 Apr 2019 18:22:05 -0400 Subject: [PATCH 1/2] Allow single process access to multiple repos. These changes modify the behavior of the Environment and hdf5 FileHandler singleton classes to instantiate singleton instances based on the path of the checked out repository. This seems to fix all outstanding bugs related to these odd behaviors, but is untested as of yet. The TransactionRegister singleton did not need to be updated because it the reference counting scheme is isolated for each lmdb environment which is passed to it. As different repository instances in the same process will now have unique lmdb environment handles, it does not require any modifications to it's current behavior. --- src/hangar/context.py | 7 ++++--- src/hangar/dataset.py | 5 +++-- src/hangar/hdf5_store.py | 11 ++++++----- src/hangar/remote/hangar_client.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/hangar/context.py b/src/hangar/context.py index 3a72298d..8486a235 100644 --- a/src/hangar/context.py +++ b/src/hangar/context.py @@ -175,9 +175,10 @@ def abort_reader_txn(self, lmdbenv: lmdb.Environment) -> bool: class EnvironmentsSingleton(type): _instances = {} def __call__(cls, *args, **kwargs): - if cls not in cls._instances: - cls._instances[cls] = super(EnvironmentsSingleton, cls).__call__(*args, **kwargs) - return cls._instances[cls] + repo_path = kwargs['repo_path'] + if repo_path not in cls._instances: + cls._instances[repo_path] = super(EnvironmentsSingleton, cls).__call__(*args, **kwargs) + return cls._instances[repo_path] class Environments(metaclass=EnvironmentsSingleton): diff --git a/src/hangar/dataset.py b/src/hangar/dataset.py index 0af8205e..ca3c69cf 100644 --- a/src/hangar/dataset.py +++ b/src/hangar/dataset.py @@ -79,7 +79,7 @@ def __init__(self, self._index_expr_factory.maketuple = False self._schema_dtype_num = varDtypeNum self._mode = mode - self._fs = FileHandles() + self._fs = FileHandles(repo_path=repo_pth) self._Query = RecordQuery(self._dataenv) self._TxnRegister = TxnRegister() @@ -925,7 +925,8 @@ def init_dataset(self, name: str, shape=None, dtype=None, prototype=None, *, STAGE_DATA_DIR = config.get('hangar.repository.stage_data_dir') stage_dir = os.path.join(self._repo_pth, STAGE_DATA_DIR, f'hdf_{schema_hash}') if not os.path.isdir(stage_dir): - f_handle = FileHandles().create_schema(self._repo_pth, schema_hash, prototype) + f_handle = FileHandles(repo_path=self._repo_pth).create_schema( + self._repo_pth, schema_hash, prototype) f_handle.close() # -------- set vals in lmdb only after schema is sure to exist -------- diff --git a/src/hangar/hdf5_store.py b/src/hangar/hdf5_store.py index 5d1c3728..ed9cb3c4 100644 --- a/src/hangar/hdf5_store.py +++ b/src/hangar/hdf5_store.py @@ -16,11 +16,11 @@ class FileHandlesSingleton(type): _instances = {} - def __call__(cls, *args, **kwargs): - if cls not in cls._instances: - cls._instances[cls] = super(FileHandlesSingleton, cls).__call__(*args, **kwargs) - return cls._instances[cls] + repo_pth = kwargs['repo_path'] + if repo_pth not in cls._instances: + cls._instances[repo_pth] = super(FileHandlesSingleton, cls).__call__(*args, **kwargs) + return cls._instances[repo_pth] ''' @@ -37,7 +37,8 @@ class FileHandles(metaclass=FileHandlesSingleton): write to the same dataset schema. ''' - def __init__(self): + def __init__(self, repo_path): + self.repo_path = repo_path self.rHands = {} self.wHands = {} self.hMaxSize = {} diff --git a/src/hangar/remote/hangar_client.py b/src/hangar/remote/hangar_client.py index 16e6758f..35b2474a 100644 --- a/src/hangar/remote/hangar_client.py +++ b/src/hangar/remote/hangar_client.py @@ -50,7 +50,7 @@ def __init__(self, auth_username: str = '', auth_password: str = ''): self.env = envs - self.fs = FileHandles() + self.fs = FileHandles(repo_path=self.env.repo_path) self.fs.open(self.env.repo_path, 'r') self.header_adder_int = header_adder_interceptor(auth_username, auth_password) From 9704809f70a28ff73392021e5bc59385ee63e751 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Wed, 1 May 2019 11:30:12 -0400 Subject: [PATCH 2/2] Validate path provided on Repository Instantiation If this is not a pathlike object, a directory on disk, or a directory which the user agent does not have write permissions for then an exception will be thrown and the no repository object will be instantiated. --- src/hangar/repository.py | 13 ++++++++---- src/hangar/utils.py | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/hangar/repository.py b/src/hangar/repository.py index a0b4eb82..3e29153e 100644 --- a/src/hangar/repository.py +++ b/src/hangar/repository.py @@ -13,7 +13,7 @@ from .diagnostics import graphing from .records import heads, parsing, summarize, commiting from .remote.hangar_client import HangarClient - +from .utils import is_valid_directory_path logger = logging.getLogger(__name__) @@ -35,9 +35,14 @@ class Repository(object): ''' def __init__(self, path): - path = os.path.join(path, config.get('hangar.repository.hangar_dir_name')) - path = os.path.expanduser(path) - self._env = Environments(repo_path=path) + + try: + usr_path = is_valid_directory_path(path) + except (TypeError, OSError, PermissionError) as e: + logger.error(e, exc_info=False) + raise + repo_pth = os.path.join(usr_path, config.get('hangar.repository.hangar_dir_name')) + self._env = Environments(repo_path=repo_pth) self._repo_path = self._env.repo_path self._client: Optional[HangarClient] = None diff --git a/src/hangar/utils.py b/src/hangar/utils.py index 207b1d1e..c2e11fb4 100644 --- a/src/hangar/utils.py +++ b/src/hangar/utils.py @@ -133,6 +133,49 @@ def folder_size(repo_path, *, recurse_directories=False): return total +def is_valid_directory_path(path: str) -> str: + '''Check if path is directory which user has write permission to. + + Parameters + ---------- + path : str + path to some location on disk + + Returns + ------- + str + If successful, the path with any user constructions expanded + (ie. `~/somedir` -> `/home/foo/somedir`) + + Raises + ------ + TypeError + If the provided path argument is not a pathlike object + OSError + If the path does not exist, or is not a directory on disk + PermissionError + If the user does not have write access to the specified path + ''' + try: + usr_path = os.path.expanduser(path) + isDir = os.path.isdir(usr_path) + isWriteable = os.access(usr_path, os.W_OK) + except TypeError: + msg = f'HANGAR TYPE ERROR:: `path` arg: {path} of type: {type(path)} '\ + f'is not valid path specifier' + raise TypeError(msg) + + if not isDir: + msg = f'HANGAR VALUE ERROR:: `path` arg: {path} is not a directory.' + raise OSError(msg) + elif not isWriteable: + msg = f'HANGAR PERMISSION ERROR:: user does not have permission to write '\ + f'to directory `path` arg: {path}' + raise PermissionError(msg) + + return usr_path + + ''' Methods following this notice have been taken & modified from the Dask Distributed project url: https://github.com/dask/distributed