From d2e0b4420846a071121f80440e81d5ca0176682c Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 15 Dec 2023 02:14:09 +0200 Subject: [PATCH] fs: replace fs.path with plain fs methods After such a long time using it, it is clear that it wasn't the right decision and path manipulation methods should be right in the class. This also makes it possible to use most of methods as classmethods without having to initialize the filesystem at all. Per https://github.com/fsspec/filesystem_spec/issues/747#issuecomment-1296356031 and also a pre-requisite for it. --- src/dvc_objects/db.py | 15 ++-- src/dvc_objects/fs/base.py | 160 ++++++++++++++++++++++++++++++---- src/dvc_objects/fs/generic.py | 10 +-- src/dvc_objects/fs/local.py | 18 ++-- src/dvc_objects/fs/path.py | 156 --------------------------------- src/dvc_objects/fs/utils.py | 6 +- tests/fs/test_localfs.py | 6 +- 7 files changed, 172 insertions(+), 199 deletions(-) delete mode 100644 src/dvc_objects/fs/path.py diff --git a/src/dvc_objects/db.py b/src/dvc_objects/db.py index fb28a85..7d80966 100644 --- a/src/dvc_objects/db.py +++ b/src/dvc_objects/db.py @@ -63,14 +63,13 @@ def _init(self, dname: str) -> None: self._dirs = set() with suppress(FileNotFoundError, NotImplementedError): self._dirs = { - self.fs.path.name(path) - for path in self.fs.ls(self.path, detail=False) + self.fs.name(path) for path in self.fs.ls(self.path, detail=False) } if dname in self._dirs: return - self.makedirs(self.fs.path.join(self.path, dname)) + self.makedirs(self.fs.join(self.path, dname)) self._dirs.add(dname) def exists(self, oid: str) -> bool: @@ -198,7 +197,7 @@ def _oid_parts(self, oid: str) -> Tuple[str, str]: return oid[:2], oid[2:] def oid_to_path(self, oid) -> str: - return self.fs.path.join(self.path, *self._oid_parts(oid)) + return self.fs.join(self.path, *self._oid_parts(oid)) def _list_prefixes( self, @@ -216,12 +215,12 @@ def _list_prefixes( yield from self.fs.find(paths, batch_size=jobs, prefix=prefix) def path_to_oid(self, path) -> str: - if self.fs.path.isabs(path): - self_path = self.fs.path.abspath(self.path) + if self.fs.isabs(path): + self_path = self.fs.abspath(self.path) else: self_path = self.path - self_parts = self.fs.path.parts(self_path) - parts = self.fs.path.parts(path)[len(self_parts) :] + self_parts = self.fs.parts(self_path) + parts = self.fs.parts(path)[len(self_parts) :] if not (len(parts) == 2 and parts[0] and len(parts[0]) == 2): raise ValueError(f"Bad cache file path '{path}'") diff --git a/src/dvc_objects/fs/base.py b/src/dvc_objects/fs/base.py index b12297f..6acf1c6 100644 --- a/src/dvc_objects/fs/base.py +++ b/src/dvc_objects/fs/base.py @@ -1,7 +1,9 @@ import asyncio import datetime import logging +import ntpath import os +import posixpath import shutil from functools import partial from multiprocessing import cpu_count @@ -11,15 +13,18 @@ Any, ClassVar, Dict, + Iterable, Iterator, List, Literal, Optional, + Sequence, Tuple, Union, cast, overload, ) +from urllib.parse import urlsplit, urlunsplit from fsspec.asyn import get_loop @@ -40,8 +45,6 @@ from fsspec.spec import AbstractFileSystem - from .path import Path - logger = logging.getLogger(__name__) @@ -68,6 +71,7 @@ def __init__(self, link: str, fs: "FileSystem", path: str) -> None: class FileSystem: sep = "/" + flavour = posixpath protocol = "base" REQUIRES: ClassVar[Dict[str, str]] = {} _JOBS = 4 * cpu_count() @@ -104,14 +108,138 @@ def config(self) -> Dict[str, Any]: def root_marker(self) -> str: return self.fs.root_marker - @cached_property - def path(self) -> "Path": - from .path import Path + def getcwd(self) -> str: + return "" + + def chdir(self, path: str): + raise NotImplementedError + + @classmethod + def join(cls, *parts: str) -> str: + return cls.flavour.join(*parts) + + @classmethod + def split(cls, path: str) -> Tuple[str, str]: + return cls.flavour.split(path) + + @classmethod + def splitext(cls, path: str) -> Tuple[str, str]: + return cls.flavour.splitext(path) + + def normpath(self, path: str) -> str: + if self.flavour == ntpath: + return self.flavour.normpath(path) + + parts = list(urlsplit(path)) + parts[2] = self.flavour.normpath(parts[2]) + return urlunsplit(parts) + + @classmethod + def isabs(cls, path: str) -> bool: + return cls.flavour.isabs(path) + + def abspath(self, path: str) -> str: + if not self.isabs(path): + path = self.join(self.getcwd(), path) + return self.normpath(path) + + @classmethod + def commonprefix(cls, paths: Sequence[str]) -> str: + return cls.flavour.commonprefix(paths) + + @classmethod + def commonpath(cls, paths: Iterable[str]) -> str: + return cls.flavour.commonpath(list(paths)) + + @classmethod + def parts(cls, path: str) -> Tuple[str, ...]: + drive, path = cls.flavour.splitdrive(path.rstrip(cls.flavour.sep)) + + ret = [] + while True: + path, part = cls.flavour.split(path) + + if part: + ret.append(part) + continue + + if path: + ret.append(path) + + break + + ret.reverse() + + if drive: + ret = [drive, *ret] - def _getcwd(): - return self.fs.root_marker + return tuple(ret) - return Path(self.sep, getcwd=_getcwd) + @classmethod + def parent(cls, path: str) -> str: + return cls.flavour.dirname(path) + + @classmethod + def dirname(cls, path: str) -> str: + return cls.parent(path) + + @classmethod + def parents(cls, path: str) -> Iterator[str]: + while True: + parent = cls.flavour.dirname(path) + if parent == path: + break + yield parent + path = parent + + @classmethod + def name(cls, path: str) -> str: + return cls.flavour.basename(path) + + @classmethod + def suffix(cls, path: str) -> str: + name = cls.name(path) + _, dot, suffix = name.partition(".") + return dot + suffix + + @classmethod + def with_name(cls, path: str, name: str) -> str: + return cls.join(cls.parent(path), name) + + @classmethod + def with_suffix(cls, path: str, suffix: str) -> str: + return cls.splitext(path)[0] + suffix + + @classmethod + def isin(cls, left: str, right: str) -> bool: + if left == right: + return False + try: + common = cls.commonpath([left, right]) + except ValueError: + # Paths don't have the same drive + return False + return common == right + + @classmethod + def isin_or_eq(cls, left: str, right: str) -> bool: + return left == right or cls.isin(left, right) + + @classmethod + def overlaps(cls, left: str, right: str) -> bool: + return cls.isin_or_eq(left, right) or cls.isin(right, left) + + def relpath(self, path: str, start: Optional[str] = None) -> str: + if start is None: + start = "." + return self.flavour.relpath(self.abspath(path), start=self.abspath(start)) + + def relparts(self, path: str, start: Optional[str] = None) -> Tuple[str, ...]: + return self.parts(self.relpath(path, start=start)) + + @classmethod + def as_posix(cls, path: str) -> str: + return path.replace(cls.flavour.sep, posixpath.sep) @classmethod def _strip_protocol(cls, path: str) -> str: @@ -299,7 +427,7 @@ def checksum(self, path: AnyFSPath) -> str: return self.fs.checksum(path) def copy(self, from_info: AnyFSPath, to_info: AnyFSPath) -> None: - self.makedirs(self.path.parent(to_info)) + self.makedirs(self.parent(to_info)) self.fs.copy(from_info, to_info) def cp_file(self, from_info: AnyFSPath, to_info: AnyFSPath, **kwargs: Any) -> None: @@ -515,7 +643,7 @@ def put_file( else: assert isinstance(from_file, str) self.fs.put_file(os.fspath(from_file), to_info, callback=callback, **kwargs) - self.fs.invalidate_cache(self.path.parent(to_info)) + self.fs.invalidate_cache(self.parent(to_info)) def get_file( self, @@ -527,7 +655,7 @@ def get_file( self.fs.get_file(from_info, to_info, callback=callback, **kwargs) def upload_fobj(self, fobj: IO, to_info: AnyFSPath, **kwargs) -> None: - self.makedirs(self.path.parent(to_info)) + self.makedirs(self.parent(to_info)) with self.open(to_info, "wb") as fdest: shutil.copyfileobj( fobj, @@ -597,7 +725,7 @@ def get( from .local import localfs def get_file(rpath, lpath, **kwargs): - localfs.makedirs(localfs.path.parent(lpath), exist_ok=True) + localfs.makedirs(localfs.parent(lpath), exist_ok=True) self.fs.get_file(rpath, lpath, **kwargs) get_file = wrap_and_branch_callback(callback, get_file) @@ -618,7 +746,7 @@ def get_file(rpath, lpath, **kwargs): return localfs.makedirs(to_info, exist_ok=True) to_infos = [ - localfs.path.join(to_info, *self.path.relparts(info, from_info)) + localfs.join(to_info, *self.relparts(info, from_info)) for info in from_infos ] @@ -679,9 +807,9 @@ def find( def _make_args(paths: List[AnyFSPath]) -> Iterator[Tuple[str, str]]: for path in paths: - if prefix and not path.endswith(self.path.flavour.sep): - parent = self.path.parent(path) - yield parent, self.path.parts(path)[-1] + if prefix and not path.endswith(self.flavour.sep): + parent = self.parent(path) + yield parent, self.parts(path)[-1] else: yield path, "" diff --git a/src/dvc_objects/fs/generic.py b/src/dvc_objects/fs/generic.py index 2d724ed..1066e1b 100644 --- a/src/dvc_objects/fs/generic.py +++ b/src/dvc_objects/fs/generic.py @@ -394,16 +394,16 @@ def test_links( ) -> List["AnyFSPath"]: from .utils import tmp_fname - from_file = from_fs.path.join(from_path, tmp_fname()) - to_file = to_fs.path.join( - to_fs.path.parent(to_path), + from_file = from_fs.join(from_path, tmp_fname()) + to_file = to_fs.join( + to_fs.parent(to_path), tmp_fname(), ) - from_fs.makedirs(from_fs.path.parent(from_file)) + from_fs.makedirs(from_fs.parent(from_file)) with from_fs.open(from_file, "wb") as fobj: fobj.write(b"test") - to_fs.makedirs(to_fs.path.parent(to_file)) + to_fs.makedirs(to_fs.parent(to_file)) ret = [] try: diff --git a/src/dvc_objects/fs/local.py b/src/dvc_objects/fs/local.py index 679ad55..029a491 100644 --- a/src/dvc_objects/fs/local.py +++ b/src/dvc_objects/fs/local.py @@ -179,6 +179,7 @@ def modified(self, path): class LocalFileSystem(FileSystem): sep = os.sep + flavour = os.path protocol = "local" PARAM_CHECKSUM = "md5" PARAM_PATH = "path" @@ -189,17 +190,18 @@ class LocalFileSystem(FileSystem): def fs(self): return FsspecLocalFileSystem(**self.config) - @cached_property - def path(self): - from .path import LocalFileSystemPath + def getcwd(self): + return os.getcwd() + + def normpath(self, path: str) -> str: + return self.flavour.normpath(path) - return LocalFileSystemPath( - self.sep, getcwd=os.getcwd, realpath=os.path.realpath - ) + def realpath(self, path: str) -> str: + return self.flavour.realpath(path) def upload_fobj(self, fobj, to_info, **kwargs): - self.makedirs(self.path.parent(to_info)) - tmp_info = self.path.join(self.path.parent(to_info), tmp_fname("")) + self.makedirs(self.parent(to_info)) + tmp_info = self.join(self.parent(to_info), tmp_fname("")) try: with open(tmp_info, "wb+") as fdest: shutil.copyfileobj(fobj, fdest) diff --git a/src/dvc_objects/fs/path.py b/src/dvc_objects/fs/path.py deleted file mode 100644 index 1eb8e0d..0000000 --- a/src/dvc_objects/fs/path.py +++ /dev/null @@ -1,156 +0,0 @@ -import ntpath -import posixpath -from typing import ( - TYPE_CHECKING, - Callable, - Iterable, - Iterator, - Optional, - Sequence, - Tuple, -) -from urllib.parse import urlsplit, urlunsplit - -if TYPE_CHECKING: - from types import ModuleType - - -class Path: - def __init__( - self, - sep: str, - getcwd: Optional[Callable[[], str]] = None, - realpath: Optional[Callable[[str], str]] = None, - ): - def _getcwd() -> str: - return "" - - self.getcwd: Callable[[], str] = getcwd or _getcwd - self.realpath = realpath or self.abspath - - if sep == posixpath.sep: - self.flavour: ModuleType = posixpath - elif sep == ntpath.sep: - self.flavour = ntpath - else: - raise ValueError(f"unsupported separator '{sep}'") - - def chdir(self, path: str): - def _getcwd() -> str: - return path - - self.getcwd = _getcwd - - def join(self, *parts: str) -> str: - return self.flavour.join(*parts) - - def split(self, path: str) -> Tuple[str, str]: - return self.flavour.split(path) - - def splitext(self, path: str) -> Tuple[str, str]: - return self.flavour.splitext(path) - - def normpath(self, path: str) -> str: - if self.flavour == ntpath: - return self.flavour.normpath(path) - - parts = list(urlsplit(path)) - parts[2] = self.flavour.normpath(parts[2]) - return urlunsplit(parts) - - def isabs(self, path: str) -> bool: - return self.flavour.isabs(path) - - def abspath(self, path: str) -> str: - if not self.isabs(path): - path = self.join(self.getcwd(), path) - return self.normpath(path) - - def commonprefix(self, paths: Sequence[str]) -> str: - return self.flavour.commonprefix(paths) - - def commonpath(self, paths: Iterable[str]) -> str: - return self.flavour.commonpath(paths) - - def parts(self, path: str) -> Tuple[str, ...]: - drive, path = self.flavour.splitdrive(path.rstrip(self.flavour.sep)) - - ret = [] - while True: - path, part = self.flavour.split(path) - - if part: - ret.append(part) - continue - - if path: - ret.append(path) - - break - - ret.reverse() - - if drive: - ret = [drive, *ret] - - return tuple(ret) - - def parent(self, path: str) -> str: - return self.flavour.dirname(path) - - def dirname(self, path: str) -> str: - return self.parent(path) - - def parents(self, path: str) -> Iterator[str]: - while True: - parent = self.flavour.dirname(path) - if parent == path: - break - yield parent - path = parent - - def name(self, path: str) -> str: - return self.flavour.basename(path) - - def suffix(self, path: str) -> str: - name = self.name(path) - _, dot, suffix = name.partition(".") - return dot + suffix - - def with_name(self, path: str, name: str) -> str: - return self.join(self.parent(path), name) - - def with_suffix(self, path: str, suffix: str) -> str: - return self.splitext(path)[0] + suffix - - def isin(self, left: str, right: str) -> bool: - if left == right: - return False - try: - common = self.commonpath([left, right]) - except ValueError: - # Paths don't have the same drive - return False - return common == right - - def isin_or_eq(self, left: str, right: str) -> bool: - return left == right or self.isin(left, right) - - def overlaps(self, left: str, right: str) -> bool: - return self.isin_or_eq(left, right) or self.isin(right, left) - - def relpath(self, path: str, start: Optional[str] = None) -> str: - if start is None: - start = "." - return self.flavour.relpath(self.abspath(path), start=self.abspath(start)) - - def relparts(self, path: str, start: Optional[str] = None) -> Tuple[str, ...]: - return self.parts(self.relpath(path, start=start)) - - def as_posix(self, path: str) -> str: - return path.replace(self.flavour.sep, posixpath.sep) - - -class LocalFileSystemPath(Path): - def normpath(self, path: str) -> str: - return self.flavour.normpath(path) diff --git a/src/dvc_objects/fs/utils.py b/src/dvc_objects/fs/utils.py index f816777..7e001b9 100644 --- a/src/dvc_objects/fs/utils.py +++ b/src/dvc_objects/fs/utils.py @@ -184,11 +184,11 @@ def tmp_fname(fname: "AnyFSPath" = "") -> "AnyFSPath": def as_atomic( fs: "FileSystem", to_info: "AnyFSPath", create_parents: bool = False ) -> Iterator["AnyFSPath"]: - parent = fs.path.parent(to_info) + parent = fs.parent(to_info) if create_parents: fs.makedirs(parent, exist_ok=True) - tmp_info = fs.path.join(parent, tmp_fname()) + tmp_info = fs.join(parent, tmp_fname()) try: yield tmp_info except BaseException: @@ -289,7 +289,7 @@ def _list_query( callback: "Callback", ): with paths_lock: - parents = {fs.path.parent(path) for path in paths} + parents = {fs.parent(path) for path in paths} for parent in parents: with paths_lock: if not paths: diff --git a/tests/fs/test_localfs.py b/tests/fs/test_localfs.py index 5d67d56..21e10dd 100644 --- a/tests/fs/test_localfs.py +++ b/tests/fs/test_localfs.py @@ -116,12 +116,12 @@ def test_walk_detail(dir_path): assert len(dirs) == len(exp_dirs) assert len(files) == len(exp_files) for basename in exp_dirs: - assert fs.path.normpath(dirs[basename]["name"]) == os.path.join( + assert fs.normpath(dirs[basename]["name"]) == os.path.join( exp_root, basename ) assert dirs[basename]["type"] == "directory" for basename in exp_files: - assert fs.path.normpath(files[basename]["name"]) == os.path.join( + assert fs.normpath(files[basename]["name"]) == os.path.join( exp_root, basename ) assert files[basename]["type"] == "file" @@ -133,4 +133,4 @@ def test_walk_detail(dir_path): def test_normpath_with_newlines(): fs = LocalFileSystem() newline_path = os.path.join("one", "two\nthree") - assert fs.path.normpath(newline_path) == newline_path + assert fs.normpath(newline_path) == newline_path