From b83564d80e0b03aae4b7aebd813480078c7485ba Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Fri, 1 Nov 2019 16:51:11 +0100 Subject: [PATCH] perf: refactor URLInfo to not use ParseResult inside This permites URLInfo construction from parts without building/reparsing string, which makes it around 20% faster. Code is also simpler and a bit shorter now. Other improvements: - added URLInfo.replace(path=...) - URLInfo._path now stringifies properly - protected against passing path not starting with / - do not allow query, params, fragments and passwords in urls (they were silently ignored previously) --- dvc/path_info.py | 169 ++++++++++++++++++------------------- dvc/remote/s3.py | 3 +- dvc/remote/ssh/__init__.py | 3 +- 3 files changed, 83 insertions(+), 92 deletions(-) diff --git a/dvc/path_info.py b/dvc/path_info.py index 3db44a4393..6707c8d81c 100644 --- a/dvc/path_info.py +++ b/dvc/path_info.py @@ -8,13 +8,12 @@ from dvc.utils.compat import str, builtin_str, basestring, is_py2 from dvc.utils.compat import pathlib, urlparse +from dvc.utils import relpath # On Python 2.7/Windows sys.getfilesystemencoding() is set to mbcs, # which is lossy, thus we can't use that, # see https://github.com/mcmtroffaes/pathlib2/issues/56. -from dvc.utils import relpath - if is_py2: fs_encoding = "utf-8" @@ -112,52 +111,80 @@ class PosixPathInfo(PathInfo, pathlib.PurePosixPath): pass +class _URLPathInfo(PosixPathInfo): + def __str__(self): + return self.__fspath__() + + __unicode__ = __str__ + + class _URLPathParents(object): - def __init__(self, pathcls, scheme, netloc, path): - self._scheme = scheme - self._netloc = netloc - self._parents = path.parents - self._pathcls = pathcls + def __init__(self, src): + self.src = src + self._parents = self.src._path.parents def __len__(self): return len(self._parents) def __getitem__(self, idx): - return self._pathcls.from_parts( - scheme=self._scheme, - netloc=self._netloc, - path=self._parents[idx].fspath, - ) + return self.src.replace(path=self._parents[idx]) def __repr__(self): - return "<{}.parents>".format(self._pathcls.__name__) + return "<{}.parents>".format(self.src) class URLInfo(object): DEFAULT_PORTS = {"http": 80, "https": 443, "ssh": 22, "hdfs": 0} def __init__(self, url): - self.parsed = urlparse(url) - assert self.parsed.scheme != "remote" + p = urlparse(url) + assert not p.query and not p.params and not p.fragment + assert p.password is None + + self.fill_parts(p.scheme, p.hostname, p.username, p.port, p.path) @classmethod def from_parts( - cls, scheme=None, netloc=None, host=None, user=None, port=None, path="" + cls, scheme=None, host=None, user=None, port=None, path="", netloc=None ): - assert scheme and (bool(host) ^ bool(netloc)) + assert bool(host) ^ bool(netloc) + + if netloc is not None: + return cls("{}://{}{}".format(scheme, netloc, path)) + + obj = cls.__new__(cls) + obj.fill_parts(scheme, host, user, port, path) + return obj + + def fill_parts(self, scheme, host, user, port, path): + assert scheme != "remote" + assert isinstance(path, (basestring, _URLPathInfo)) + + self.scheme, self.host, self.user = scheme, host, user + self.port = int(port) if port else self.DEFAULT_PORTS.get(self.scheme) + + if isinstance(path, _URLPathInfo): + self._spath = builtin_str(path) + self._path = path + else: + if path and path[0] != "/": + path = "/" + path + self._spath = path + + @property + def _base_parts(self): + return (self.scheme, self.host, self.user, self.port) + + @property + def parts(self): + return self._base_parts + self._path.parts - if netloc is None: - netloc = host - if user: - netloc = user + "@" + host - if port: - netloc += ":" + str(port) - return cls("{}://{}{}".format(scheme, netloc, path)) + def replace(self, path=None): + return self.from_parts(*self._base_parts, path=path) @cached_property def url(self): - p = self.parsed - return "{}://{}{}".format(p.scheme, self.netloc, p.path) + return "{}://{}{}".format(self.scheme, self.netloc, self._spath) def __str__(self): return self.url @@ -170,92 +197,60 @@ def __eq__(self, other): other = self.__class__(other) return ( self.__class__ == other.__class__ - and self.scheme == other.scheme - and self.netloc == other.netloc + and self._base_parts == other._base_parts and self._path == other._path ) def __hash__(self): - return hash(self.url) + return hash(self.parts) def __div__(self, other): - p = self.parsed - new_path = posixpath.join(p.path, str(other)) - if not new_path.startswith("/"): - new_path = "/" + new_path - new_url = "{}://{}{}".format(p.scheme, p.netloc, new_path) - return self.__class__(new_url) + return self.replace(path=posixpath.join(self._spath, other)) __truediv__ = __div__ - def __getattr__(self, name): - # When deepcopy is called, it creates and object without __init__, - # self.parsed is not initialized and it causes infinite recursion. - # More on this special casing here: - # https://stackoverflow.com/a/47300262/298182 - if name.startswith("__"): - raise AttributeError(name) - return getattr(self.parsed, name) - - @cached_property - def netloc(self): - p = self.parsed - netloc = p.hostname - if p.username: - netloc = p.username + "@" + netloc - if p.port and int(p.port) != self.DEFAULT_PORTS.get(p.scheme): - netloc += ":" + str(p.port) - return netloc - @property - def port(self): - return self.parsed.port or self.DEFAULT_PORTS.get(self.parsed.scheme) - - @property - def host(self): - return self.parsed.hostname - - @property - def user(self): - return self.parsed.username + def path(self): + return self._spath @cached_property def _path(self): - return PosixPathInfo(self.parsed.path) + return _URLPathInfo(self._spath) @property def name(self): return self._path.name - @property - def parts(self): - return (self.scheme, self.netloc) + self._path.parts + @cached_property + def netloc(self): + netloc = self.host + if self.user: + netloc = self.user + "@" + netloc + if self.port and int(self.port) != self.DEFAULT_PORTS.get(self.scheme): + netloc += ":" + str(self.port) + return netloc @property def bucket(self): - return self.parsed.netloc + return self.netloc @property def parent(self): - return self.from_parts( - scheme=self.scheme, - netloc=self.parsed.netloc, - path=self._path.parent.fspath, - ) + return self.replace(path=self._path.parent) @property def parents(self): - return _URLPathParents( - type(self), self.scheme, self.parsed.netloc, self._path - ) + return _URLPathParents(self) def relative_to(self, other): - if isinstance(other, str): - other = URLInfo(other) - if self.scheme != other.scheme or self.netloc != other.netloc: - raise ValueError( - "'{}' does not start with '{}'".format(self, other) - ) + if isinstance(other, basestring): + other = self.__class__(other) + if self.__class__ != other.__class__: + msg = "'{}' has incompatible class with '{}'".format(self, other) + raise ValueError(msg) + if self._base_parts != other._base_parts: + msg = "'{}' does not start with '{}'".format(self, other) + raise ValueError(msg) return self._path.relative_to(other._path) def isin(self, other): @@ -263,14 +258,12 @@ def isin(self, other): other = self.__class__(other) elif self.__class__ != other.__class__: return False - return ( - self.scheme == other.scheme - and self.netloc == other.netloc - and self._path.isin(other._path) + return self._base_parts == other._base_parts and self._path.isin( + other._path ) class CloudURLInfo(URLInfo): @property def path(self): - return self.parsed.path.lstrip("/") + return self._spath.lstrip("/") diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 7557caecd6..169aee3ba9 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -4,7 +4,6 @@ import os import logging -import posixpath from funcy import cached_property from dvc.progress import Tqdm @@ -273,4 +272,4 @@ def _generate_download_url(self, path_info, expires=3600): def walk_files(self, path_info, max_items=None): for fname in self._list_paths(path_info, max_items): - yield path_info / posixpath.relpath(fname, path_info.path) + yield path_info.replace(path=fname) diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index c6b39b5b5a..7230f016ab 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -4,7 +4,6 @@ import itertools import io import os -import posixpath import getpass import logging import threading @@ -268,7 +267,7 @@ def list_cache_paths(self): def walk_files(self, path_info): with self.ssh(path_info) as ssh: for fname in ssh.walk_files(path_info.path): - yield path_info / posixpath.relpath(fname, path_info.path) + yield path_info.replace(path=fname) def makedirs(self, path_info): with self.ssh(path_info) as ssh: