From 7945fc5f8e10c8d5196f047a9f0003014e078ad5 Mon Sep 17 00:00:00 2001 From: pared Date: Thu, 20 Jun 2019 11:29:53 +0200 Subject: [PATCH 01/14] state: use ignore file handler when accessing metadata --- dvc/utils/fs.py | 39 ++++++++++++++++++++---------- tests/func/test_ignore.py | 48 +++++++++++++++++++++++++++++++++++++ tests/unit/utils/test_fs.py | 6 ++--- 3 files changed, 78 insertions(+), 15 deletions(-) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 8538aee1b6..e3afa06c62 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -7,7 +7,7 @@ from dvc.exceptions import DvcException from dvc.system import System -from dvc.utils import dvc_walk +from dvc.utils import dvc_walk, dict_md5 from dvc.utils.compat import str @@ -20,15 +20,21 @@ def get_inode(path): return inode -def get_mtime_and_size(path): - stat = os.stat(path) - size = stat.st_size - mtime = stat.st_mtime +def get_mtime_signature_and_size(path, ignore_file_handler=None): + base_stat = os.stat(path) + size = base_stat.st_size if os.path.isdir(path): - for root, dirs, files in dvc_walk(path): - for name in dirs + files: - entry = os.path.join(root, name) + files_mtimes = {} + for root, dirs, files in dvc_walk( + str(path), ignore_file_handler=ignore_file_handler + ): + for dir in dirs: + entry = os.path.join(root, dir) + size += os.path.getsize(entry) + + for file in files: + entry = os.path.join(root, file) try: stat = os.stat(entry) except OSError as exc: @@ -37,13 +43,22 @@ def get_mtime_and_size(path): raise continue size += stat.st_size - entry_mtime = stat.st_mtime - if entry_mtime > mtime: - mtime = entry_mtime + files_mtimes[entry] = stat.st_mtime + + # Why mtime for dir is actually dict_md5 from {file_path:mtime} pairs? + # In case of updating .dvcignore-d file in dir, mtime for directory + # would be updated. We don't want to detect that, yet we have to detect + # operations that results in dvc tracked directory mtime update and not + # file mtime updates (e.g. moving tracked file), hence we need to + # combine update mtimes with file_paths + mtime_signature = dict_md5(files_mtimes) + else: + mtime_signature = base_stat.st_mtime + mtime_signature = int(nanotime.timestamp(mtime_signature)) # State of files handled by dvc is stored in db as TEXT. # We cast results to string for later comparisons with stored values. - return str(int(nanotime.timestamp(mtime))), str(size) + return str(mtime_signature), str(size) class BasePathNotInCheckedPathException(DvcException): diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index a646d4afb7..afdd5770c3 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -1,7 +1,9 @@ import os +import shutil from dvc.ignore import DvcIgnore, DvcIgnoreFileHandler from dvc.utils.compat import cast_bytes +from dvc.utils.fs import get_mtime_signature_and_size from tests.basic_env import TestDvc @@ -56,3 +58,49 @@ def test_ignore_in_parent_dir(self): all_paths = self._get_all_paths() self.assertNotIn(forbidden_path, all_paths) + + +def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): + new_data_path = repo_dir.DATA_SUB + "_new" + with open( + os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE), "w" + ) as fobj: + fobj.write(repo_dir.DATA_SUB + "\n") + fobj.write(new_data_path + "\n") + + ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + + mtime_sig, size = get_mtime_signature_and_size( + repo_dir.DATA_DIR, ignore_handler + ) + + shutil.move(repo_dir.DATA_SUB, new_data_path) + + new_mtime_sig, new_size = get_mtime_signature_and_size( + repo_dir.DATA_DIR, ignore_handler + ) + + assert new_mtime_sig == mtime_sig + assert new_size == size + + +def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): + with open( + os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE), "w" + ) as fobj: + fobj.write(repo_dir.DATA_SUB + "\n") + + ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + + mtime_sig, size = get_mtime_signature_and_size( + repo_dir.DATA_DIR, ignore_handler + ) + + os.remove(repo_dir.DATA_SUB) + + new_mtime_sig, new_size = get_mtime_signature_and_size( + repo_dir.DATA_DIR, ignore_handler + ) + + assert new_mtime_sig == mtime_sig + assert new_size == size diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index f34a914151..ebb62b8fb4 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -8,7 +8,7 @@ from dvc.utils import relpath from dvc.utils.compat import str from dvc.utils.fs import ( - get_mtime_and_size, + get_mtime_signature_and_size, contains_symlink_up_to, BasePathNotInCheckedPathException, get_parent_dirs_up_to, @@ -20,8 +20,8 @@ class TestMtimeAndSize(TestDir): def test(self): - file_time, file_size = get_mtime_and_size(self.DATA) - dir_time, dir_size = get_mtime_and_size(self.DATA_DIR) + file_time, file_size = get_mtime_signature_and_size(self.DATA) + dir_time, dir_size = get_mtime_signature_and_size(self.DATA_DIR) actual_file_size = os.path.getsize(self.DATA) actual_dir_size = ( From c1b1861beec7f8f9d2444c017a6dffe1edc9776c Mon Sep 17 00:00:00 2001 From: pared Date: Mon, 24 Jun 2019 12:24:26 +0200 Subject: [PATCH 02/14] fs: don't add dir size to overall size for dir --- dvc/utils/fs.py | 26 +++++++----------- tests/func/test_ignore.py | 55 +++++++++++++++++++++++-------------- tests/unit/utils/test_fs.py | 13 ++++----- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index e3afa06c62..0b3184fc77 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -20,18 +20,15 @@ def get_inode(path): return inode -def get_mtime_signature_and_size(path, ignore_file_handler=None): +def get_mtime_and_size(path, ignore_file_handler=None): base_stat = os.stat(path) - size = base_stat.st_size + size = 0 if os.path.isdir(path): files_mtimes = {} for root, dirs, files in dvc_walk( - str(path), ignore_file_handler=ignore_file_handler + path, ignore_file_handler=ignore_file_handler ): - for dir in dirs: - entry = os.path.join(root, dir) - size += os.path.getsize(entry) for file in files: entry = os.path.join(root, file) @@ -45,20 +42,17 @@ def get_mtime_signature_and_size(path, ignore_file_handler=None): size += stat.st_size files_mtimes[entry] = stat.st_mtime - # Why mtime for dir is actually dict_md5 from {file_path:mtime} pairs? - # In case of updating .dvcignore-d file in dir, mtime for directory - # would be updated. We don't want to detect that, yet we have to detect - # operations that results in dvc tracked directory mtime update and not - # file mtime updates (e.g. moving tracked file), hence we need to - # combine update mtimes with file_paths - mtime_signature = dict_md5(files_mtimes) + # We track file changes and moves, which cannot be detected with simply + # max(mtime(f) for f in non_ignored_files) + mtime = dict_md5(files_mtimes) else: - mtime_signature = base_stat.st_mtime - mtime_signature = int(nanotime.timestamp(mtime_signature)) + size += base_stat.st_size + mtime = base_stat.st_mtime + mtime = int(nanotime.timestamp(mtime)) # State of files handled by dvc is stored in db as TEXT. # We cast results to string for later comparisons with stored values. - return str(mtime_signature), str(size) + return str(mtime), str(size) class BasePathNotInCheckedPathException(DvcException): diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index afdd5770c3..23b1ed3568 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -3,7 +3,7 @@ from dvc.ignore import DvcIgnore, DvcIgnoreFileHandler from dvc.utils.compat import cast_bytes -from dvc.utils.fs import get_mtime_signature_and_size +from dvc.utils.fs import get_mtime_and_size from tests.basic_env import TestDvc @@ -62,21 +62,18 @@ def test_ignore_in_parent_dir(self): def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - with open( - os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE), "w" - ) as fobj: - fobj.write(repo_dir.DATA_SUB + "\n") - fobj.write(new_data_path + "\n") + + ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) + + repo_dir.create(ignore_file, "\n".join([repo_dir.DATA_SUB, new_data_path])) ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime_sig, size = get_mtime_signature_and_size( - repo_dir.DATA_DIR, ignore_handler - ) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime_sig, new_size = get_mtime_signature_and_size( + new_mtime_sig, new_size = get_mtime_and_size( repo_dir.DATA_DIR, ignore_handler ) @@ -84,23 +81,41 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): assert new_size == size -def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): - with open( - os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE), "w" - ) as fobj: - fobj.write(repo_dir.DATA_SUB + "\n") +def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): + new_data_path = repo_dir.DATA_SUB + "_new" + ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + + shutil.move(repo_dir.DATA_SUB, new_data_path) + new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + + assert new_mtime != mtime + assert new_size == size + +def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): + ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) + repo_dir.create(ignore_file, repo_dir.DATA_SUB) ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) - mtime_sig, size = get_mtime_signature_and_size( + os.remove(repo_dir.DATA_SUB) + new_mtime_sig, new_size = get_mtime_and_size( repo_dir.DATA_DIR, ignore_handler ) - os.remove(repo_dir.DATA_SUB) + assert new_mtime_sig == mtime_sig + assert new_size == size + - new_mtime_sig, new_size = get_mtime_signature_and_size( +def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): + ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + + os.remove(repo_dir.DATA_SUB) + new_mtime_sig, new_size = get_mtime_and_size( repo_dir.DATA_DIR, ignore_handler ) - assert new_mtime_sig == mtime_sig - assert new_size == size + assert new_mtime_sig != mtime_sig + assert new_size != size diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index ebb62b8fb4..abcb5bc25f 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -8,7 +8,7 @@ from dvc.utils import relpath from dvc.utils.compat import str from dvc.utils.fs import ( - get_mtime_signature_and_size, + get_mtime_and_size, contains_symlink_up_to, BasePathNotInCheckedPathException, get_parent_dirs_up_to, @@ -20,15 +20,12 @@ class TestMtimeAndSize(TestDir): def test(self): - file_time, file_size = get_mtime_signature_and_size(self.DATA) - dir_time, dir_size = get_mtime_signature_and_size(self.DATA_DIR) + file_time, file_size = get_mtime_and_size(self.DATA) + dir_time, dir_size = get_mtime_and_size(self.DATA_DIR) actual_file_size = os.path.getsize(self.DATA) - actual_dir_size = ( - os.path.getsize(self.DATA_DIR) - + os.path.getsize(self.DATA) - + os.path.getsize(self.DATA_SUB_DIR) - + os.path.getsize(self.DATA_SUB) + actual_dir_size = os.path.getsize(self.DATA) + os.path.getsize( + self.DATA_SUB ) self.assertIs(type(file_time), str) From bfa49326889dcfa849c2982e313f1727a2265f19 Mon Sep 17 00:00:00 2001 From: prd Date: Mon, 24 Jun 2019 12:00:45 -0600 Subject: [PATCH 03/14] ignore: use posix-like path on windows --- dvc/ignore.py | 52 ++++++++++++++++-------------------- dvc/remote/local/__init__.py | 12 ++++----- dvc/repo/__init__.py | 8 +++--- dvc/repo/add.py | 2 +- dvc/scm/git/tree.py | 19 +++---------- dvc/scm/tree.py | 9 +++---- dvc/utils/__init__.py | 18 +++---------- dvc/utils/fs.py | 6 ++--- tests/func/test_checkout.py | 2 +- tests/func/test_ignore.py | 44 +++++++++++++++--------------- tests/utils/__init__.py | 4 +++ 11 files changed, 72 insertions(+), 104 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 5c91d302ba..fbd575bb61 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -8,16 +8,16 @@ from dvc.utils.fs import get_parent_dirs_up_to -class DvcIgnoreFileHandler(object): - def __init__(self, tree): - self.tree = tree - - def read_patterns(self, path): - with self.tree.open(path) as fobj: - return PathSpec.from_lines(GitWildMatchPattern, fobj) - - def get_repo_root(self): - return self.tree.tree_root +# class DvcIgnoreFileHandler(object): +# def __init__(self, tree): +# self.tree = tree +# +# def read_patterns(self, path): +# with self.tree.open(path) as fobj: +# return PathSpec.from_lines(GitWildMatchPattern, fobj) +# +# def get_repo_root(self): +# return self.tree.tree_root class DvcIgnore(object): @@ -28,11 +28,12 @@ def __call__(self, root, dirs, files): class DvcIgnoreFromFile(DvcIgnore): - def __init__(self, ignore_file_path, ignore_handler): + def __init__(self, ignore_file_path, tree): self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) - self.ignore_spec = ignore_handler.read_patterns(ignore_file_path) + with tree.open(ignore_file_path) as fobj: + self.ignore_spec = PathSpec.from_lines(GitWildMatchPattern, fobj) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -43,9 +44,6 @@ def __call__(self, root, dirs, files): def matches(self, dirname, basename): abs_path = os.path.join(dirname, basename) relative_path = relpath(abs_path, self.dirname) - if os.name == "nt": - relative_path = relative_path.replace("\\", "/") - return self.ignore_spec.match_file(relative_path) def __hash__(self): @@ -72,7 +70,7 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self, wdir, ignore_file_handler=None): + def __init__(self, wdir, tree): self.ignores = [ DvcIgnoreDir(".git"), DvcIgnoreDir(".hg"), @@ -80,28 +78,24 @@ def __init__(self, wdir, ignore_file_handler=None): DvcIgnoreFile(".dvcignore"), ] - self.ignore_file_handler = ignore_file_handler + self.tree = tree self._process_ignores_in_parent_dirs(wdir) def _process_ignores_in_parent_dirs(self, wdir): - if self.ignore_file_handler: - wdir = os.path.normpath(os.path.abspath(wdir)) - ignore_search_end_dir = self.ignore_file_handler.get_repo_root() - parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) - for d in parent_dirs: - self.update(d) + wdir = os.path.normpath(os.path.abspath(wdir)) + ignore_search_end_dir = self.tree.tree_root + parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) + for d in parent_dirs: + self.update(d) def update(self, wdir): ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) - if os.path.exists(ignore_file_path): - file_ignore = DvcIgnoreFromFile( - ignore_file_path, ignore_handler=self.ignore_file_handler - ) + if self.tree.exists(ignore_file_path): + file_ignore = DvcIgnoreFromFile(ignore_file_path, tree=self.tree) self.ignores.append(file_ignore) def __call__(self, root, dirs, files): - if self.ignore_file_handler: - self.update(root) + self.update(root) for ignore in self.ignores: dirs, files = ignore(root, dirs, files) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 1f82690157..0f89c1063f 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -477,13 +477,11 @@ def _unprotect_file(path): os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE) - @staticmethod - def _unprotect_dir(path): - for path in walk_files(path): + def _unprotect_dir(self, path): + for path in walk_files(path, self.repo.tree): RemoteLOCAL._unprotect_file(path) - @staticmethod - def unprotect(path_info): + def unprotect(self, path_info): path = path_info.fspath if not os.path.exists(path): raise DvcException( @@ -491,9 +489,9 @@ def unprotect(path_info): ) if os.path.isdir(path): - RemoteLOCAL._unprotect_dir(path) + self._unprotect_dir(path) else: - RemoteLOCAL._unprotect_file(path) + self._unprotect_file(path) @staticmethod def protect(path_info): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index fc90f0ed71..01621df4a5 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -12,7 +12,8 @@ TargetNotDirectoryError, OutputFileMissingError, ) -from dvc.ignore import DvcIgnoreFileHandler + +# from dvc.ignore import DvcIgnoreFileHandler from dvc.path_info import PathInfo from dvc.utils.compat import open as _open, fspath_py35 from dvc.utils import relpath @@ -376,10 +377,7 @@ def stages(self, from_directory=None, check_dag=True): stages = [] outs = [] - ignore_file_handler = DvcIgnoreFileHandler(self.tree) - for root, dirs, files in self.tree.walk( - from_directory, ignore_file_handler=ignore_file_handler - ): + for root, dirs, files in self.tree.walk(from_directory): for fname in files: path = os.path.join(root, fname) if not Stage.is_valid_filename(path): diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 0b02a19aba..2642ffa6c0 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -44,7 +44,7 @@ def _find_all_targets(repo, target, recursive): if os.path.isdir(target) and recursive: return [ fname - for fname in walk_files(target) + for fname in walk_files(target, repo.tree) if not repo.is_dvc_internal(fname) if not Stage.is_stage_file(fname) if not repo.scm.belongs_to_scm(fname) diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py index 89c593c54d..9651fad12e 100644 --- a/dvc/scm/git/tree.py +++ b/dvc/scm/git/tree.py @@ -103,13 +103,7 @@ def git_object_by_path(self, path): tree = tree[i] return tree - def _walk( - self, - tree, - topdown=True, - ignore_file_handler=None, - dvc_ignore_filter=None, - ): + def _walk(self, tree, topdown=True, dvc_ignore_filter=None): dirs, nondirs = [], [] for i in tree: if i.mode == GIT_MODE_DIR: @@ -119,25 +113,20 @@ def _walk( if topdown: if not dvc_ignore_filter: - dvc_ignore_filter = DvcIgnoreFilter( - tree.abspath, ignore_file_handler=ignore_file_handler - ) + dvc_ignore_filter = DvcIgnoreFilter(tree.abspath, self) dirs, nondirs = dvc_ignore_filter(tree.path, dirs, nondirs) yield os.path.normpath(tree.abspath), dirs, nondirs for i in dirs: for x in self._walk( - tree[i], - topdown=True, - ignore_file_handler=ignore_file_handler, - dvc_ignore_filter=dvc_ignore_filter, + tree[i], topdown=True, dvc_ignore_filter=dvc_ignore_filter ): yield x if not topdown: yield os.path.normpath(tree.abspath), dirs, nondirs - def walk(self, top, topdown=True, ignore_file_handler=None): + def walk(self, top, topdown=True): """Directory tree generator. See `os.walk` for the docs. Differences: diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index 8a555902f9..0944668551 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -23,7 +23,7 @@ def isdir(self, path): def isfile(self, path): """Test whether a path is a regular file""" - def walk(self, top, topdown=True, ignore_file_handler=None): + def walk(self, top, topdown=True): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -60,7 +60,7 @@ def isfile(self, path): """Test whether a path is a regular file""" return os.path.isfile(path) - def walk(self, top, topdown=True, ignore_file_handler=None): + def walk(self, top, topdown=True): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -72,9 +72,6 @@ def onerror(e): raise e for root, dirs, files in dvc_walk( - os.path.abspath(top), - topdown=topdown, - onerror=onerror, - ignore_file_handler=ignore_file_handler, + os.path.abspath(top), self, topdown=topdown ): yield os.path.normpath(root), dirs, files diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index a372d90d4e..4117ea0c56 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -272,13 +272,7 @@ def to_yaml_string(data): return stream.getvalue() -def dvc_walk( - top, - topdown=True, - onerror=None, - followlinks=False, - ignore_file_handler=None, -): +def dvc_walk(top, tree, topdown=True, onerror=None, followlinks=False): """ Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. @@ -287,9 +281,7 @@ def dvc_walk( if topdown: from dvc.ignore import DvcIgnoreFilter - ignore_filter = DvcIgnoreFilter( - top, ignore_file_handler=ignore_file_handler - ) + ignore_filter = DvcIgnoreFilter(top, tree) for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks @@ -301,10 +293,8 @@ def dvc_walk( yield root, dirs, files -def walk_files(directory, ignore_file_handler=None): - for root, _, files in dvc_walk( - directory, ignore_file_handler=ignore_file_handler - ): +def walk_files(directory, tree): + for root, _, files in dvc_walk(directory, tree): for f in files: yield os.path.join(root, f) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 0b3184fc77..30f592f1a6 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -20,15 +20,13 @@ def get_inode(path): return inode -def get_mtime_and_size(path, ignore_file_handler=None): +def get_mtime_and_size(path, tree): base_stat = os.stat(path) size = 0 if os.path.isdir(path): files_mtimes = {} - for root, dirs, files in dvc_walk( - path, ignore_file_handler=ignore_file_handler - ): + for root, dirs, files in dvc_walk(path, tree=tree): for file in files: entry = os.path.join(root, file) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index c984ff4100..972d20f18b 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -136,7 +136,7 @@ def outs_info(self, stage): paths = [ path for output in stage["outs"] - for path in walk_files(output["path"]) + for path in walk_files(output["path"], self.dvc.tree) ] return [ diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 23b1ed3568..e12a78cf10 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -1,24 +1,22 @@ import os import shutil -from dvc.ignore import DvcIgnore, DvcIgnoreFileHandler +from dvc.ignore import DvcIgnore from dvc.utils.compat import cast_bytes from dvc.utils.fs import get_mtime_and_size from tests.basic_env import TestDvc +from tests.utils import to_posixpath class TestDvcIgnore(TestDvc): def setUp(self): super(TestDvcIgnore, self).setUp() - self.ignore_file_handler = DvcIgnoreFileHandler(self.dvc.tree) + # self.ignore_file_handler = DvcIgnoreFileHandler(self.dvc.tree) def _get_all_paths(self): paths = [] - ignore_file_handler = DvcIgnoreFileHandler(self.dvc.tree) - for root, dirs, files in self.dvc.tree.walk( - self.dvc.root_dir, ignore_file_handler=ignore_file_handler - ): + for root, dirs, files in self.dvc.tree.walk(self.dvc.root_dir): for dname in dirs: paths.append(os.path.join(root, dname)) @@ -64,17 +62,19 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) + repo_dir.create( + ignore_file, + "\n".join( + [to_posixpath(repo_dir.DATA_SUB), to_posixpath(new_data_path)] + ), + ) - repo_dir.create(ignore_file, "\n".join([repo_dir.DATA_SUB, new_data_path])) - - ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) shutil.move(repo_dir.DATA_SUB, new_data_path) new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, ignore_handler + repo_dir.DATA_DIR, dvc_repo.tree ) assert new_mtime_sig == mtime_sig @@ -83,11 +83,11 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + # ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) assert new_mtime != mtime assert new_size == size @@ -95,13 +95,13 @@ def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) - repo_dir.create(ignore_file, repo_dir.DATA_SUB) - ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) + # ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) os.remove(repo_dir.DATA_SUB) new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, ignore_handler + repo_dir.DATA_DIR, dvc_repo.tree ) assert new_mtime_sig == mtime_sig @@ -109,12 +109,12 @@ def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): - ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, ignore_handler) + # ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) os.remove(repo_dir.DATA_SUB) new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, ignore_handler + repo_dir.DATA_DIR, dvc_repo.tree ) assert new_mtime_sig != mtime_sig diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index b43af8d066..cd50069c98 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -40,3 +40,7 @@ def trees_equal(dir_path_1, dir_path_2): for d in comparison.common_dirs: trees_equal(os.path.join(dir_path_1, d), os.path.join(dir_path_2, d)) + + +def to_posixpath(path): + return path.replace("\\", "/") From 6d2158a1fd9142a35a406913e753b023a3da3f8e Mon Sep 17 00:00:00 2001 From: pared Date: Wed, 26 Jun 2019 00:00:02 +0200 Subject: [PATCH 04/14] ignore: remove tree from dvc_walk arguments --- dvc/ignore.py | 45 ++++++++++++++++++------------------ dvc/remote/local/__init__.py | 12 ++++++---- dvc/repo/__init__.py | 2 -- dvc/repo/add.py | 2 +- dvc/scm/git/tree.py | 2 +- dvc/scm/tree.py | 2 +- dvc/utils/__init__.py | 8 +++---- dvc/utils/fs.py | 4 ++-- tests/func/test_checkout.py | 2 +- tests/func/test_ignore.py | 26 +++++++-------------- tests/unit/test_ignore.py | 16 +++++-------- 11 files changed, 53 insertions(+), 68 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index fbd575bb61..81cc5679c3 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -4,22 +4,12 @@ from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern +from dvc.exceptions import NotDvcRepoError +from dvc.scm.tree import WorkingTree from dvc.utils import relpath from dvc.utils.fs import get_parent_dirs_up_to -# class DvcIgnoreFileHandler(object): -# def __init__(self, tree): -# self.tree = tree -# -# def read_patterns(self, path): -# with self.tree.open(path) as fobj: -# return PathSpec.from_lines(GitWildMatchPattern, fobj) -# -# def get_repo_root(self): -# return self.tree.tree_root - - class DvcIgnore(object): DVCIGNORE_FILE = ".dvcignore" @@ -70,7 +60,7 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self, wdir, tree): + def __init__(self, wdir): self.ignores = [ DvcIgnoreDir(".git"), DvcIgnoreDir(".hg"), @@ -78,21 +68,30 @@ def __init__(self, wdir, tree): DvcIgnoreFile(".dvcignore"), ] - self.tree = tree + from dvc.repo import Repo + + try: + self.tree = WorkingTree(Repo.find_root()) + except NotDvcRepoError: + self.tree = None self._process_ignores_in_parent_dirs(wdir) def _process_ignores_in_parent_dirs(self, wdir): - wdir = os.path.normpath(os.path.abspath(wdir)) - ignore_search_end_dir = self.tree.tree_root - parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) - for d in parent_dirs: - self.update(d) + if self.tree: + wdir = os.path.normpath(os.path.abspath(wdir)) + ignore_search_end_dir = self.tree.tree_root + parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) + for d in parent_dirs: + self.update(d) def update(self, wdir): - ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) - if self.tree.exists(ignore_file_path): - file_ignore = DvcIgnoreFromFile(ignore_file_path, tree=self.tree) - self.ignores.append(file_ignore) + if self.tree: + ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) + if self.tree.exists(ignore_file_path): + file_ignore = DvcIgnoreFromFile( + ignore_file_path, tree=self.tree + ) + self.ignores.append(file_ignore) def __call__(self, root, dirs, files): self.update(root) diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 0f89c1063f..1f82690157 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -477,11 +477,13 @@ def _unprotect_file(path): os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE) - def _unprotect_dir(self, path): - for path in walk_files(path, self.repo.tree): + @staticmethod + def _unprotect_dir(path): + for path in walk_files(path): RemoteLOCAL._unprotect_file(path) - def unprotect(self, path_info): + @staticmethod + def unprotect(path_info): path = path_info.fspath if not os.path.exists(path): raise DvcException( @@ -489,9 +491,9 @@ def unprotect(self, path_info): ) if os.path.isdir(path): - self._unprotect_dir(path) + RemoteLOCAL._unprotect_dir(path) else: - self._unprotect_file(path) + RemoteLOCAL._unprotect_file(path) @staticmethod def protect(path_info): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 01621df4a5..fd974682b1 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -12,8 +12,6 @@ TargetNotDirectoryError, OutputFileMissingError, ) - -# from dvc.ignore import DvcIgnoreFileHandler from dvc.path_info import PathInfo from dvc.utils.compat import open as _open, fspath_py35 from dvc.utils import relpath diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 2642ffa6c0..0b02a19aba 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -44,7 +44,7 @@ def _find_all_targets(repo, target, recursive): if os.path.isdir(target) and recursive: return [ fname - for fname in walk_files(target, repo.tree) + for fname in walk_files(target) if not repo.is_dvc_internal(fname) if not Stage.is_stage_file(fname) if not repo.scm.belongs_to_scm(fname) diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py index 9651fad12e..8c9bba3bfd 100644 --- a/dvc/scm/git/tree.py +++ b/dvc/scm/git/tree.py @@ -113,7 +113,7 @@ def _walk(self, tree, topdown=True, dvc_ignore_filter=None): if topdown: if not dvc_ignore_filter: - dvc_ignore_filter = DvcIgnoreFilter(tree.abspath, self) + dvc_ignore_filter = DvcIgnoreFilter(tree.abspath) dirs, nondirs = dvc_ignore_filter(tree.path, dirs, nondirs) yield os.path.normpath(tree.abspath), dirs, nondirs diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index 0944668551..c12e12b9b3 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -72,6 +72,6 @@ def onerror(e): raise e for root, dirs, files in dvc_walk( - os.path.abspath(top), self, topdown=topdown + os.path.abspath(top), topdown=topdown, onerror=onerror ): yield os.path.normpath(root), dirs, files diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 4117ea0c56..15e4935f13 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -272,7 +272,7 @@ def to_yaml_string(data): return stream.getvalue() -def dvc_walk(top, tree, topdown=True, onerror=None, followlinks=False): +def dvc_walk(top, topdown=True, onerror=None, followlinks=False): """ Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. @@ -281,7 +281,7 @@ def dvc_walk(top, tree, topdown=True, onerror=None, followlinks=False): if topdown: from dvc.ignore import DvcIgnoreFilter - ignore_filter = DvcIgnoreFilter(top, tree) + ignore_filter = DvcIgnoreFilter(top) for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks @@ -293,8 +293,8 @@ def dvc_walk(top, tree, topdown=True, onerror=None, followlinks=False): yield root, dirs, files -def walk_files(directory, tree): - for root, _, files in dvc_walk(directory, tree): +def walk_files(directory): + for root, _, files in dvc_walk(directory): for f in files: yield os.path.join(root, f) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 30f592f1a6..f5a95c127a 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -20,13 +20,13 @@ def get_inode(path): return inode -def get_mtime_and_size(path, tree): +def get_mtime_and_size(path): base_stat = os.stat(path) size = 0 if os.path.isdir(path): files_mtimes = {} - for root, dirs, files in dvc_walk(path, tree=tree): + for root, dirs, files in dvc_walk(path): for file in files: entry = os.path.join(root, file) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 972d20f18b..c984ff4100 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -136,7 +136,7 @@ def outs_info(self, stage): paths = [ path for output in stage["outs"] - for path in walk_files(output["path"], self.dvc.tree) + for path in walk_files(output["path"]) ] return [ diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index e12a78cf10..50b71cf479 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -11,7 +11,6 @@ class TestDvcIgnore(TestDvc): def setUp(self): super(TestDvcIgnore, self).setUp() - # self.ignore_file_handler = DvcIgnoreFileHandler(self.dvc.tree) def _get_all_paths(self): @@ -69,13 +68,11 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): ), ) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.tree - ) + new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) assert new_mtime_sig == mtime_sig assert new_size == size @@ -83,11 +80,10 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - # ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) + new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR) assert new_mtime != mtime assert new_size == size @@ -96,26 +92,20 @@ def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) - # ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR) os.remove(repo_dir.DATA_SUB) - new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.tree - ) + new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) assert new_mtime_sig == mtime_sig assert new_size == size def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): - # ignore_handler = DvcIgnoreFileHandler(dvc_repo.tree) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.tree) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR) os.remove(repo_dir.DATA_SUB) - new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.tree - ) + new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) assert new_mtime_sig != mtime_sig assert new_size != size diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index aa0b8109b5..c14bd3255d 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -5,19 +5,15 @@ from pathspec.patterns import GitWildMatchPattern from dvc.ignore import DvcIgnoreFromFile, DvcIgnoreDir, DvcIgnoreFile -from mock import patch, Mock +from mock import MagicMock def mock_dvcignore(dvcignore_path, patterns): - mock_ignore_file_handler = Mock() - with patch.object( - mock_ignore_file_handler, - "read_patterns", - return_value=PathSpec.from_lines(GitWildMatchPattern, patterns), - ): - ignore_file = DvcIgnoreFromFile( - dvcignore_path, mock_ignore_file_handler - ) + tree_mock = MagicMock() + ignore_file = DvcIgnoreFromFile(dvcignore_path, tree_mock) + ignore_file.ignore_spec = PathSpec.from_lines( + GitWildMatchPattern, patterns + ) return ignore_file From 9c7239d33068d009a43c7a6740f3618f23bf0a17 Mon Sep 17 00:00:00 2001 From: pared Date: Wed, 26 Jun 2019 14:22:32 +0200 Subject: [PATCH 05/14] ignore: handle DvcIgnoreFilter creation outside of DVC repo --- dvc/ignore.py | 34 +++++++++++++++++++--------------- dvc/utils/fs.py | 32 ++++++++++++++------------------ tests/func/test_ignore.py | 19 +++++++++++++++++++ 3 files changed, 52 insertions(+), 33 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 81cc5679c3..26fbb7758d 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import logging import os from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern @@ -9,6 +10,8 @@ from dvc.utils import relpath from dvc.utils.fs import get_parent_dirs_up_to +logger = logging.getLogger(__name__) + class DvcIgnore(object): DVCIGNORE_FILE = ".dvcignore" @@ -71,27 +74,28 @@ def __init__(self, wdir): from dvc.repo import Repo try: - self.tree = WorkingTree(Repo.find_root()) + self.tree = WorkingTree(Repo.find_root(wdir)) except NotDvcRepoError: - self.tree = None + logger.warning( + "Traversing directory outside of DvcRepo. " + "ignore files will be read from '{}' " + "downward.".format(wdir) + ) + self.tree = WorkingTree(os.path.abspath(wdir)) self._process_ignores_in_parent_dirs(wdir) def _process_ignores_in_parent_dirs(self, wdir): - if self.tree: - wdir = os.path.normpath(os.path.abspath(wdir)) - ignore_search_end_dir = self.tree.tree_root - parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) - for d in parent_dirs: - self.update(d) + wdir = os.path.normpath(os.path.abspath(wdir)) + ignore_search_end_dir = self.tree.tree_root + parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) + for d in parent_dirs: + self.update(d) def update(self, wdir): - if self.tree: - ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) - if self.tree.exists(ignore_file_path): - file_ignore = DvcIgnoreFromFile( - ignore_file_path, tree=self.tree - ) - self.ignores.append(file_ignore) + ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) + if self.tree.exists(ignore_file_path): + file_ignore = DvcIgnoreFromFile(ignore_file_path, tree=self.tree) + self.ignores.append(file_ignore) def __call__(self, root, dirs, files): self.update(root) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index f5a95c127a..d29d2e22b2 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -7,7 +7,7 @@ from dvc.exceptions import DvcException from dvc.system import System -from dvc.utils import dvc_walk, dict_md5 +from dvc.utils import dict_md5, walk_files from dvc.utils.compat import str @@ -21,30 +21,26 @@ def get_inode(path): def get_mtime_and_size(path): - base_stat = os.stat(path) - size = 0 - if os.path.isdir(path): + size = 0 files_mtimes = {} - for root, dirs, files in dvc_walk(path): - - for file in files: - entry = os.path.join(root, file) - try: - stat = os.stat(entry) - except OSError as exc: - # NOTE: broken symlink case. - if exc.errno != errno.ENOENT: - raise - continue - size += stat.st_size - files_mtimes[entry] = stat.st_mtime + for file_path in walk_files(path): + try: + stat = os.stat(file_path) + except OSError as exc: + # NOTE: broken symlink case. + if exc.errno != errno.ENOENT: + raise + continue + size += stat.st_size + files_mtimes[file_path] = stat.st_mtime # We track file changes and moves, which cannot be detected with simply # max(mtime(f) for f in non_ignored_files) mtime = dict_md5(files_mtimes) else: - size += base_stat.st_size + base_stat = os.stat(path) + size = base_stat.st_size mtime = base_stat.st_mtime mtime = int(nanotime.timestamp(mtime)) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 50b71cf479..d7811a808a 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -109,3 +109,22 @@ def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): assert new_mtime_sig != mtime_sig assert new_size != size + + +def test_ignore_should_work_for_external_dependency(dvc_repo, repo_dir): + external_data_dir = repo_dir.mkdtemp() + data = os.path.join(external_data_dir, "data") + ignored_file = "data_ignored" + data_ignored = os.path.join(external_data_dir, ignored_file) + ignore_file = os.path.join(external_data_dir, DvcIgnore.DVCIGNORE_FILE) + repo_dir.create(data, "data") + repo_dir.create(data_ignored, "ignore this") + repo_dir.create(ignore_file, ignored_file) + + dvc_repo.add(external_data_dir) + + assert dvc_repo.status() == {} + + os.remove(data_ignored) + + assert dvc_repo.status() == {} From f83b988a96f8576990827f354dde401372d23e9e Mon Sep 17 00:00:00 2001 From: pared Date: Mon, 1 Jul 2019 18:39:29 +0200 Subject: [PATCH 06/14] ignore: throw on dvcignore file under output dir, but not dependency dir --- dvc/dependency/local.py | 4 +++- dvc/exceptions.py | 5 +++++ dvc/ignore.py | 27 +++++++++++++++++++++------ dvc/output/base.py | 14 +++++++++++--- dvc/remote/base.py | 32 +++++++++++++++++--------------- dvc/remote/local/__init__.py | 10 +++++++--- dvc/remote/ssh/__init__.py | 2 +- dvc/repo/add.py | 2 +- dvc/stage.py | 2 ++ dvc/state.py | 12 ++++++------ dvc/utils/__init__.py | 18 ++++++++++++++---- dvc/utils/fs.py | 7 ++++--- tests/func/test_ignore.py | 30 +++++++++++++++++++++++++----- tests/unit/utils/test_fs.py | 1 - 14 files changed, 117 insertions(+), 49 deletions(-) diff --git a/dvc/dependency/local.py b/dvc/dependency/local.py index 60fb124c08..a438a39aa6 100644 --- a/dvc/dependency/local.py +++ b/dvc/dependency/local.py @@ -5,4 +5,6 @@ class DependencyLOCAL(DependencyBase, OutputLOCAL): - pass + @property + def dvcignore_raises(self): + return False diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 6b38af4ead..e47f883e97 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -270,3 +270,8 @@ def __init__(self, path): super(OutputFileMissingError, self).__init__( "Can't find {} neither locally nor on remote".format(path) ) + + +class DvcIgnoreError(DvcException): + def __init__(self, msg): + super(DvcIgnoreError, self).__init__(msg) diff --git a/dvc/ignore.py b/dvc/ignore.py index 26fbb7758d..33734d64f6 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -5,7 +5,7 @@ from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern -from dvc.exceptions import NotDvcRepoError +from dvc.exceptions import NotDvcRepoError, DvcIgnoreError from dvc.scm.tree import WorkingTree from dvc.utils import relpath from dvc.utils.fs import get_parent_dirs_up_to @@ -63,7 +63,7 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self, wdir): + def __init__(self, wdir, raise_on_dvcignore_below_top=False): self.ignores = [ DvcIgnoreDir(".git"), DvcIgnoreDir(".hg"), @@ -76,29 +76,44 @@ def __init__(self, wdir): try: self.tree = WorkingTree(Repo.find_root(wdir)) except NotDvcRepoError: - logger.warning( + logger.error( "Traversing directory outside of DvcRepo. " "ignore files will be read from '{}' " "downward.".format(wdir) ) + # TODO is it feasible now that we want to raise on external dir? self.tree = WorkingTree(os.path.abspath(wdir)) + + self.raise_on_dvcignore_below_top = raise_on_dvcignore_below_top + self._process_ignores_in_parent_dirs(wdir) def _process_ignores_in_parent_dirs(self, wdir): wdir = os.path.normpath(os.path.abspath(wdir)) ignore_search_end_dir = self.tree.tree_root parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) + + if not self.raise_on_dvcignore_below_top: + parent_dirs.append(wdir) + for d in parent_dirs: - self.update(d) + self.update(d, False) - def update(self, wdir): + def update(self, wdir, raise_on_dvcignore): ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): + + if raise_on_dvcignore: + raise DvcIgnoreError( + "Found dvcignore file in directory where it " + "should not be: '{}'".format(wdir) + ) + file_ignore = DvcIgnoreFromFile(ignore_file_path, tree=self.tree) self.ignores.append(file_ignore) def __call__(self, root, dirs, files): - self.update(root) + self.update(root, self.raise_on_dvcignore_below_top) for ignore in self.ignores: dirs, files = ignore(root, dirs, files) diff --git a/dvc/output/base.py b/dvc/output/base.py index 65439e83b9..62c5711812 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -156,10 +156,14 @@ def is_dir_checksum(self): def exists(self): return self.remote.exists(self.path_info) + @property + def dvcignore_raises(self): + return True + def changed_checksum(self): return ( self.checksum - != self.remote.save_info(self.path_info)[ + != self.remote.save_info(self.path_info, self.dvcignore_raises)[ self.remote.PARAM_CHECKSUM ] ) @@ -211,7 +215,9 @@ def save(self): logger.warning("'{}' is empty.".format(self)) if not self.use_cache: - self.info = self.remote.save_info(self.path_info) + self.info = self.remote.save_info( + self.path_info, self.dvcignore_raises + ) if self.metric: self.verify_metric() if not self.IS_DEPENDENCY: @@ -237,7 +243,9 @@ def save(self): if self.use_cache: self.repo.scm.ignore(self.fspath) - self.info = self.remote.save_info(self.path_info) + self.info = self.remote.save_info( + self.path_info, self.dvcignore_raises + ) def commit(self): if self.use_cache: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index e56171d0ec..fae34931df 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -141,12 +141,13 @@ def cache(self): def get_file_checksum(self, path_info): raise NotImplementedError - def _collect_dir(self, path_info): + def _collect_dir(self, path_info, dvcignore_raises): dir_info = {} with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor: - for root, _dirs, files in self.walk(path_info): - root_info = path_info / root + for root, _dirs, files in self.walk(path_info, dvcignore_raises): + root_info = path_info / root + for fname in files: file_info = root_info / fname @@ -183,15 +184,15 @@ def _collect_dir(self, path_info): # NOTE: sorting the list by path to ensure reproducibility return sorted(dir_info.values(), key=itemgetter(self.PARAM_RELPATH)) - def get_dir_checksum(self, path_info): - dir_info = self._collect_dir(path_info) + def get_dir_checksum(self, path_info, dvcignore_raises=True): + dir_info = self._collect_dir(path_info, dvcignore_raises) checksum, tmp_info = self._get_dir_info_checksum(dir_info) new_info = self.cache.checksum_to_path_info(checksum) if self.cache.changed_cache_file(checksum): self.cache.move(tmp_info, new_info) - self.state.save(path_info, checksum) - self.state.save(new_info, checksum) + self.state.save(path_info, checksum, dvcignore_raises) + self.state.save(new_info, checksum, dvcignore_raises) return checksum @@ -251,11 +252,11 @@ def load_dir_cache(self, checksum): def is_dir_checksum(cls, checksum): return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) - def get_checksum(self, path_info): + def get_checksum(self, path_info, dvcignore_raises=True): if not self.exists(path_info): return None - checksum = self.state.get(path_info) + checksum = self.state.get(path_info, dvcignore_raises) # If we have dir checksum in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_checksum() call below, @@ -266,23 +267,24 @@ def get_checksum(self, path_info): and not self.exists(self.cache.checksum_to_path_info(checksum)) ): checksum = None - if checksum: return checksum if self.isdir(path_info): - checksum = self.get_dir_checksum(path_info) + checksum = self.get_dir_checksum(path_info, dvcignore_raises) else: checksum = self.get_file_checksum(path_info) if checksum: - self.state.save(path_info, checksum) + self.state.save(path_info, checksum, dvcignore_raises) return checksum - def save_info(self, path_info): + def save_info(self, path_info, dvcignore_raises=True): assert path_info.scheme == self.scheme - return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} + return { + self.PARAM_CHECKSUM: self.get_checksum(path_info, dvcignore_raises) + } def changed(self, path_info, checksum_info): """Checks if data has changed. @@ -379,7 +381,7 @@ def isfile(self, path_info): def isdir(self, path_info): return False - def walk(self, path_info): + def walk(self, path_info, dvcignore_raises=True): raise NotImplementedError @staticmethod diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 1f82690157..7c157ab1c5 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -29,6 +29,7 @@ file_md5, walk_files, relpath, + dvc_walk, ) from dvc.config import Config from dvc.exceptions import DvcException @@ -221,8 +222,11 @@ def isfile(self, path_info): def isdir(self, path_info): return os.path.isdir(fspath_py35(path_info)) - def walk(self, path_info): - return os.walk(fspath_py35(path_info)) + def walk(self, path_info, dvcignore_raises=True): + return dvc_walk( + fspath_py35(path_info), + raise_on_dvcignore_below_top=dvcignore_raises, + ) def get_file_checksum(self, path_info): return file_md5(fspath_py35(path_info))[0] @@ -479,7 +483,7 @@ def _unprotect_file(path): @staticmethod def _unprotect_dir(path): - for path in walk_files(path): + for path in walk_files(path, raise_on_dvcignore_below_top=True): RemoteLOCAL._unprotect_file(path) @staticmethod diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index 07fdb910fa..f8997ee217 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -234,7 +234,7 @@ def list_cache_paths(self): with self.ssh(self.path_info) as ssh: return list(ssh.walk_files(self.path_info.path)) - def walk(self, path_info): + def walk(self, path_info, dvcignore_raises=True): with self.ssh(path_info) as ssh: for entry in ssh.walk(path_info.path): yield entry diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 0b02a19aba..daccd2c43e 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -44,7 +44,7 @@ def _find_all_targets(repo, target, recursive): if os.path.isdir(target) and recursive: return [ fname - for fname in walk_files(target) + for fname in walk_files(target, raise_on_dvcignore_below_top=True) if not repo.is_dvc_internal(fname) if not Stage.is_stage_file(fname) if not repo.scm.belongs_to_scm(fname) diff --git a/dvc/stage.py b/dvc/stage.py index 9ca9256e63..376bfd37a9 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -401,6 +401,7 @@ def is_cached(self): # NOTE: need to save checksums for deps in order to compare them # with what is written in the old stage. + # TODO this should use dvcignore for dep in self.deps: dep.save() @@ -706,6 +707,7 @@ def _compute_md5(self): return m def save(self): + # TODO should we use dvcignore here? for dep in self.deps: dep.save() diff --git a/dvc/state.py b/dvc/state.py index 3f03d084e2..67c097d551 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -32,10 +32,10 @@ def __init__(self, dvc_version, expected, actual): class StateBase(object): - def save(self, path_info, checksum): + def save(self, path_info, checksum, dvcignore_raises=True): pass - def get(self, path_info): + def get(self, path_info, dvcignore_raises=True): return None def save_link(self, path_info): @@ -342,7 +342,7 @@ def get_state_record_for_inode(self, inode): return results[0] return None - def save(self, path_info, checksum): + def save(self, path_info, checksum, dvcignore_raises=True): """Save checksum for the specified path info. Args: @@ -355,7 +355,7 @@ def save(self, path_info, checksum): path = fspath_py35(path_info) assert os.path.exists(path) - actual_mtime, actual_size = get_mtime_and_size(path) + actual_mtime, actual_size = get_mtime_and_size(path, dvcignore_raises) actual_inode = get_inode(path) existing_record = self.get_state_record_for_inode(actual_inode) @@ -369,7 +369,7 @@ def save(self, path_info, checksum): actual_inode, actual_mtime, actual_size, checksum ) - def get(self, path_info): + def get(self, path_info, dvcignore_raises=True): """Gets the checksum for the specified path info. Checksum will be retrieved from the state database if available. @@ -386,7 +386,7 @@ def get(self, path_info): if not os.path.exists(path): return None - actual_mtime, actual_size = get_mtime_and_size(path) + actual_mtime, actual_size = get_mtime_and_size(path, dvcignore_raises) actual_inode = get_inode(path) existing_record = self.get_state_record_for_inode(actual_inode) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 15e4935f13..778e091a82 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -272,7 +272,13 @@ def to_yaml_string(data): return stream.getvalue() -def dvc_walk(top, topdown=True, onerror=None, followlinks=False): +def dvc_walk( + top, + topdown=True, + onerror=None, + followlinks=False, + raise_on_dvcignore_below_top=False, +): """ Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. @@ -281,7 +287,9 @@ def dvc_walk(top, topdown=True, onerror=None, followlinks=False): if topdown: from dvc.ignore import DvcIgnoreFilter - ignore_filter = DvcIgnoreFilter(top) + ignore_filter = DvcIgnoreFilter( + top, raise_on_dvcignore_below_top=raise_on_dvcignore_below_top + ) for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks @@ -293,8 +301,10 @@ def dvc_walk(top, topdown=True, onerror=None, followlinks=False): yield root, dirs, files -def walk_files(directory): - for root, _, files in dvc_walk(directory): +def walk_files(directory, raise_on_dvcignore_below_top=False): + for root, _, files in dvc_walk( + directory, raise_on_dvcignore_below_top=raise_on_dvcignore_below_top + ): for f in files: yield os.path.join(root, f) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index d29d2e22b2..f2e7a3e664 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -20,11 +20,13 @@ def get_inode(path): return inode -def get_mtime_and_size(path): +def get_mtime_and_size(path, dvcignore_raises=True): if os.path.isdir(path): size = 0 files_mtimes = {} - for file_path in walk_files(path): + for file_path in walk_files( + path, raise_on_dvcignore_below_top=dvcignore_raises + ): try: stat = os.stat(file_path) except OSError as exc: @@ -81,7 +83,6 @@ def get_parent_dirs_up_to(wdir, root_dir): return [] dirs = [] - dirs.append(wdir) while wdir != root_dir: wdir = os.path.dirname(wdir) dirs.append(wdir) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index d7811a808a..cf7c2e0bb9 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -1,6 +1,9 @@ import os import shutil +import pytest + +from dvc.exceptions import DvcIgnoreError from dvc.ignore import DvcIgnore from dvc.utils.compat import cast_bytes from dvc.utils.fs import get_mtime_and_size @@ -111,7 +114,7 @@ def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): assert new_size != size -def test_ignore_should_work_for_external_dependency(dvc_repo, repo_dir): +def test_ignore_should_raise_for_external_dependency(dvc_repo, repo_dir): external_data_dir = repo_dir.mkdtemp() data = os.path.join(external_data_dir, "data") ignored_file = "data_ignored" @@ -121,10 +124,27 @@ def test_ignore_should_work_for_external_dependency(dvc_repo, repo_dir): repo_dir.create(data_ignored, "ignore this") repo_dir.create(ignore_file, ignored_file) - dvc_repo.add(external_data_dir) + with pytest.raises(DvcIgnoreError): + dvc_repo.add(external_data_dir) + + +def test_ignore_should_raise_on_ignore_file_below_output_dir( + dvc_repo, repo_dir +): + ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) + ignore_file_content = os.path.basename(repo_dir.DATA) + repo_dir.create(ignore_file, ignore_file_content) + + with pytest.raises(DvcIgnoreError): + dvc_repo.add(repo_dir.DATA_DIR) - assert dvc_repo.status() == {} - os.remove(data_ignored) +def test_ignore_should_not_raise_on_ignore_in_dependency_dir( + dvc_repo, repo_dir +): + ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) + ignore_file_content = os.path.basename(repo_dir.DATA) + repo_dir.create(ignore_file, ignore_file_content) - assert dvc_repo.status() == {} + stage_file = "stage.dvc" + dvc_repo.run(fname=stage_file, deps=[repo_dir.DATA_DIR]) diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index abcb5bc25f..18a30e4f9d 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -125,7 +125,6 @@ def test_get_parent_dirs_up_to_should_raise_on_no_absolute(path1, path2): [ os.path.join(os.sep, "some"), os.path.join(os.sep, "some", "long"), - os.path.join(os.sep, "some", "long", "path"), ], ), ], From 3cf01b843faa82f81ebebc5ef0e45b4c4f36e3a1 Mon Sep 17 00:00:00 2001 From: pared Date: Wed, 3 Jul 2019 14:01:28 +0200 Subject: [PATCH 07/14] ignore: move ignore initialization to Repo init --- dvc/dependency/local.py | 4 +- dvc/exceptions.py | 5 --- dvc/ignore.py | 74 ++++++++++++++---------------------- dvc/output/base.py | 14 ++----- dvc/remote/base.py | 28 +++++++------- dvc/remote/local/__init__.py | 17 +++------ dvc/remote/ssh/__init__.py | 2 +- dvc/repo/__init__.py | 56 +++++++++++++++++++++------ dvc/repo/add.py | 2 +- dvc/scm/git/tree.py | 10 +++-- dvc/scm/tree.py | 14 +++++-- dvc/state.py | 20 ++++++---- dvc/utils/__init__.py | 24 +++--------- dvc/utils/fs.py | 6 +-- tests/func/test_ignore.py | 74 ++++++++++++++++++++++++------------ tests/func/test_repo.py | 3 ++ tests/unit/test_ignore.py | 13 +++---- 17 files changed, 193 insertions(+), 173 deletions(-) diff --git a/dvc/dependency/local.py b/dvc/dependency/local.py index a438a39aa6..60fb124c08 100644 --- a/dvc/dependency/local.py +++ b/dvc/dependency/local.py @@ -5,6 +5,4 @@ class DependencyLOCAL(DependencyBase, OutputLOCAL): - @property - def dvcignore_raises(self): - return False + pass diff --git a/dvc/exceptions.py b/dvc/exceptions.py index e47f883e97..6b38af4ead 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -270,8 +270,3 @@ def __init__(self, path): super(OutputFileMissingError, self).__init__( "Can't find {} neither locally nor on remote".format(path) ) - - -class DvcIgnoreError(DvcException): - def __init__(self, msg): - super(DvcIgnoreError, self).__init__(msg) diff --git a/dvc/ignore.py b/dvc/ignore.py index 33734d64f6..d2caa268ad 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -5,10 +5,6 @@ from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern -from dvc.exceptions import NotDvcRepoError, DvcIgnoreError -from dvc.scm.tree import WorkingTree -from dvc.utils import relpath -from dvc.utils.fs import get_parent_dirs_up_to logger = logging.getLogger(__name__) @@ -25,8 +21,28 @@ def __init__(self, ignore_file_path, tree): self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) + # TODO maybe we should readlines, join with ignore_file_path.dirname + # and then parse? with tree.open(ignore_file_path) as fobj: - self.ignore_spec = PathSpec.from_lines(GitWildMatchPattern, fobj) + ignore_lines = fobj.readlines() + + def full_path_pattern(pattern): + negation = False + if not pattern.startswith("/"): + if pattern.startswith("!"): + pattern = pattern[1:] + negation = True + pattern = os.path.join(self.dirname, pattern) + # TODO NOTE: need to escape beggining of path + pattern = "/" + pattern + if negation: + pattern = "!" + pattern + return pattern + + ignore_lines = [full_path_pattern(p) for p in ignore_lines] + self.ignore_spec = PathSpec.from_lines( + GitWildMatchPattern, ignore_lines + ) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -35,9 +51,10 @@ def __call__(self, root, dirs, files): return dirs, files def matches(self, dirname, basename): - abs_path = os.path.join(dirname, basename) - relative_path = relpath(abs_path, self.dirname) - return self.ignore_spec.match_file(relative_path) + abs_path = os.path.abspath(os.path.join(dirname, basename)) + # relative_path = relpath(abs_path, self.dirname) + result = self.ignore_spec.match_file(abs_path) + return result def __hash__(self): return hash(self.ignore_file_path) @@ -63,7 +80,7 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self, wdir, raise_on_dvcignore_below_top=False): + def __init__(self, tree): self.ignores = [ DvcIgnoreDir(".git"), DvcIgnoreDir(".hg"), @@ -71,50 +88,15 @@ def __init__(self, wdir, raise_on_dvcignore_below_top=False): DvcIgnoreFile(".dvcignore"), ] - from dvc.repo import Repo + self.tree = tree - try: - self.tree = WorkingTree(Repo.find_root(wdir)) - except NotDvcRepoError: - logger.error( - "Traversing directory outside of DvcRepo. " - "ignore files will be read from '{}' " - "downward.".format(wdir) - ) - # TODO is it feasible now that we want to raise on external dir? - self.tree = WorkingTree(os.path.abspath(wdir)) - - self.raise_on_dvcignore_below_top = raise_on_dvcignore_below_top - - self._process_ignores_in_parent_dirs(wdir) - - def _process_ignores_in_parent_dirs(self, wdir): - wdir = os.path.normpath(os.path.abspath(wdir)) - ignore_search_end_dir = self.tree.tree_root - parent_dirs = get_parent_dirs_up_to(wdir, ignore_search_end_dir) - - if not self.raise_on_dvcignore_below_top: - parent_dirs.append(wdir) - - for d in parent_dirs: - self.update(d, False) - - def update(self, wdir, raise_on_dvcignore): + def update(self, wdir): ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): - - if raise_on_dvcignore: - raise DvcIgnoreError( - "Found dvcignore file in directory where it " - "should not be: '{}'".format(wdir) - ) - file_ignore = DvcIgnoreFromFile(ignore_file_path, tree=self.tree) self.ignores.append(file_ignore) def __call__(self, root, dirs, files): - self.update(root, self.raise_on_dvcignore_below_top) - for ignore in self.ignores: dirs, files = ignore(root, dirs, files) diff --git a/dvc/output/base.py b/dvc/output/base.py index 62c5711812..65439e83b9 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -156,14 +156,10 @@ def is_dir_checksum(self): def exists(self): return self.remote.exists(self.path_info) - @property - def dvcignore_raises(self): - return True - def changed_checksum(self): return ( self.checksum - != self.remote.save_info(self.path_info, self.dvcignore_raises)[ + != self.remote.save_info(self.path_info)[ self.remote.PARAM_CHECKSUM ] ) @@ -215,9 +211,7 @@ def save(self): logger.warning("'{}' is empty.".format(self)) if not self.use_cache: - self.info = self.remote.save_info( - self.path_info, self.dvcignore_raises - ) + self.info = self.remote.save_info(self.path_info) if self.metric: self.verify_metric() if not self.IS_DEPENDENCY: @@ -243,9 +237,7 @@ def save(self): if self.use_cache: self.repo.scm.ignore(self.fspath) - self.info = self.remote.save_info( - self.path_info, self.dvcignore_raises - ) + self.info = self.remote.save_info(self.path_info) def commit(self): if self.use_cache: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index fae34931df..d89d37c509 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -141,11 +141,11 @@ def cache(self): def get_file_checksum(self, path_info): raise NotImplementedError - def _collect_dir(self, path_info, dvcignore_raises): + def _collect_dir(self, path_info): dir_info = {} with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor: - for root, _dirs, files in self.walk(path_info, dvcignore_raises): + for root, _dirs, files in self.walk(path_info): root_info = path_info / root @@ -184,15 +184,15 @@ def _collect_dir(self, path_info, dvcignore_raises): # NOTE: sorting the list by path to ensure reproducibility return sorted(dir_info.values(), key=itemgetter(self.PARAM_RELPATH)) - def get_dir_checksum(self, path_info, dvcignore_raises=True): - dir_info = self._collect_dir(path_info, dvcignore_raises) + def get_dir_checksum(self, path_info): + dir_info = self._collect_dir(path_info) checksum, tmp_info = self._get_dir_info_checksum(dir_info) new_info = self.cache.checksum_to_path_info(checksum) if self.cache.changed_cache_file(checksum): self.cache.move(tmp_info, new_info) - self.state.save(path_info, checksum, dvcignore_raises) - self.state.save(new_info, checksum, dvcignore_raises) + self.state.save(path_info, checksum) + self.state.save(new_info, checksum) return checksum @@ -252,11 +252,11 @@ def load_dir_cache(self, checksum): def is_dir_checksum(cls, checksum): return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) - def get_checksum(self, path_info, dvcignore_raises=True): + def get_checksum(self, path_info): if not self.exists(path_info): return None - checksum = self.state.get(path_info, dvcignore_raises) + checksum = self.state.get(path_info) # If we have dir checksum in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_checksum() call below, @@ -271,20 +271,18 @@ def get_checksum(self, path_info, dvcignore_raises=True): return checksum if self.isdir(path_info): - checksum = self.get_dir_checksum(path_info, dvcignore_raises) + checksum = self.get_dir_checksum(path_info) else: checksum = self.get_file_checksum(path_info) if checksum: - self.state.save(path_info, checksum, dvcignore_raises) + self.state.save(path_info, checksum) return checksum - def save_info(self, path_info, dvcignore_raises=True): + def save_info(self, path_info): assert path_info.scheme == self.scheme - return { - self.PARAM_CHECKSUM: self.get_checksum(path_info, dvcignore_raises) - } + return {self.PARAM_CHECKSUM: self.get_checksum(path_info)} def changed(self, path_info, checksum_info): """Checks if data has changed. @@ -381,7 +379,7 @@ def isfile(self, path_info): def isdir(self, path_info): return False - def walk(self, path_info, dvcignore_raises=True): + def walk(self, path_info): raise NotImplementedError @staticmethod diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 7c157ab1c5..79afe2ef09 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -222,11 +222,8 @@ def isfile(self, path_info): def isdir(self, path_info): return os.path.isdir(fspath_py35(path_info)) - def walk(self, path_info, dvcignore_raises=True): - return dvc_walk( - fspath_py35(path_info), - raise_on_dvcignore_below_top=dvcignore_raises, - ) + def walk(self, path_info): + return dvc_walk(fspath_py35(path_info), dvcignore=self.repo.dvcignore) def get_file_checksum(self, path_info): return file_md5(fspath_py35(path_info))[0] @@ -481,13 +478,11 @@ def _unprotect_file(path): os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE) - @staticmethod - def _unprotect_dir(path): - for path in walk_files(path, raise_on_dvcignore_below_top=True): + def _unprotect_dir(self, path): + for path in walk_files(path, self.repo.dvcignore): RemoteLOCAL._unprotect_file(path) - @staticmethod - def unprotect(path_info): + def unprotect(self, path_info): path = path_info.fspath if not os.path.exists(path): raise DvcException( @@ -495,7 +490,7 @@ def unprotect(path_info): ) if os.path.isdir(path): - RemoteLOCAL._unprotect_dir(path) + self._unprotect_dir(path) else: RemoteLOCAL._unprotect_file(path) diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index f8997ee217..07fdb910fa 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -234,7 +234,7 @@ def list_cache_paths(self): with self.ssh(self.path_info) as ssh: return list(ssh.walk_files(self.path_info.path)) - def walk(self, path_info, dvcignore_raises=True): + def walk(self, path_info): with self.ssh(path_info) as ssh: for entry in ssh.walk(path_info.path): yield entry diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index fd974682b1..8817220a61 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -12,6 +12,7 @@ TargetNotDirectoryError, OutputFileMissingError, ) +from dvc.ignore import DvcIgnoreFilter from dvc.path_info import PathInfo from dvc.utils.compat import open as _open, fspath_py35 from dvc.utils import relpath @@ -84,6 +85,7 @@ def __init__(self, root_dir=None): self.tag = Tag(self) self._ignore() + self.dvcignore = self.load_dvcignores() def __repr__(self): return "Repo: '{root_dir}'".format(root_dir=self.root_dir) @@ -355,6 +357,44 @@ def pipelines(self, from_directory=None): G.subgraph(c).copy() for c in nx.weakly_connected_components(G) ] + def make_out_dir_filter(self, outs, root_dir): + def filter_dirs(dname): + path = os.path.join(root_dir, dname) + if path in (self.dvc_dir, self.scm.dir): + return False + for out in outs: + if path == os.path.normpath(out) or path.startswith(out): + return False + return True + + return filter_dirs + + def load_dvcignores(self): + from dvc.stage import Stage + + outs = [] + # TODO maybe dvcignore should be a part of WorkingTree? + dvcignore = DvcIgnoreFilter(self.tree) + for root, dirs, files in self.tree.walk(self.root_dir): + for fname in files: + path = os.path.join(root, fname) + if not Stage.is_valid_filename(path): + continue + try: + stage = Stage.load(self, path) + + for out in stage.outs: + if out.scheme == "local": + outs.append(out.fspath + out.sep) + # TODO + except Exception: + pass + out_dir_filter = self.make_out_dir_filter(outs, root) + dirs[:] = list(filter(out_dir_filter, dirs)) + dvcignore.update(root) + + return dvcignore + def stages(self, from_directory=None, check_dag=True): """ Walks down the root directory looking for Dvcfiles, @@ -375,7 +415,9 @@ def stages(self, from_directory=None, check_dag=True): stages = [] outs = [] - for root, dirs, files in self.tree.walk(from_directory): + for root, dirs, files in self.tree.walk( + from_directory, dvcignore=self.dvcignore + ): for fname in files: path = os.path.join(root, fname) if not Stage.is_valid_filename(path): @@ -386,16 +428,8 @@ def stages(self, from_directory=None, check_dag=True): outs.append(out.fspath + out.sep) stages.append(stage) - def filter_dirs(dname, root=root): - path = os.path.join(root, dname) - if path in (self.dvc_dir, self.scm.dir): - return False - for out in outs: - if path == os.path.normpath(out) or path.startswith(out): - return False - return True - - dirs[:] = list(filter(filter_dirs, dirs)) + out_dir_filter = self.make_out_dir_filter(outs, root) + dirs[:] = list(filter(out_dir_filter, dirs)) if check_dag: self.check_dag(stages) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index daccd2c43e..0092fcc58b 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -44,7 +44,7 @@ def _find_all_targets(repo, target, recursive): if os.path.isdir(target) and recursive: return [ fname - for fname in walk_files(target, raise_on_dvcignore_below_top=True) + for fname in walk_files(target, repo.dvcignore) if not repo.is_dvc_internal(fname) if not Stage.is_stage_file(fname) if not repo.scm.belongs_to_scm(fname) diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py index 8c9bba3bfd..ef8d7a8841 100644 --- a/dvc/scm/git/tree.py +++ b/dvc/scm/git/tree.py @@ -1,7 +1,6 @@ import errno import os -from dvc.ignore import DvcIgnoreFilter from dvc.utils import relpath from dvc.utils.compat import StringIO, BytesIO from dvc.exceptions import DvcException @@ -104,6 +103,7 @@ def git_object_by_path(self, path): return tree def _walk(self, tree, topdown=True, dvc_ignore_filter=None): + # TODO how to handle dvcignore here? dirs, nondirs = [], [] for i in tree: if i.mode == GIT_MODE_DIR: @@ -113,8 +113,10 @@ def _walk(self, tree, topdown=True, dvc_ignore_filter=None): if topdown: if not dvc_ignore_filter: - dvc_ignore_filter = DvcIgnoreFilter(tree.abspath) - dirs, nondirs = dvc_ignore_filter(tree.path, dirs, nondirs) + # dvc_ignore_filter = DvcIgnoreFilter(tree.abspath) + pass + if dvc_ignore_filter: + dirs, nondirs = dvc_ignore_filter(tree.path, dirs, nondirs) yield os.path.normpath(tree.abspath), dirs, nondirs for i in dirs: @@ -126,7 +128,7 @@ def _walk(self, tree, topdown=True, dvc_ignore_filter=None): if not topdown: yield os.path.normpath(tree.abspath), dirs, nondirs - def walk(self, top, topdown=True): + def walk(self, top, topdown=True, dvcignore=None): """Directory tree generator. See `os.walk` for the docs. Differences: diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index c12e12b9b3..9d18b5e738 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -23,7 +23,7 @@ def isdir(self, path): def isfile(self, path): """Test whether a path is a regular file""" - def walk(self, top, topdown=True): + def walk(self, top, topdown=True, dvcignore=None): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -60,7 +60,7 @@ def isfile(self, path): """Test whether a path is a regular file""" return os.path.isfile(path) - def walk(self, top, topdown=True): + def walk(self, top, topdown=True, dvcignore=None): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -68,10 +68,18 @@ def walk(self, top, topdown=True): - it could raise exceptions, there is no onerror argument """ + if not dvcignore: + from dvc.ignore import DvcIgnoreFilter + + dvcignore = DvcIgnoreFilter(self) + def onerror(e): raise e for root, dirs, files in dvc_walk( - os.path.abspath(top), topdown=topdown, onerror=onerror + os.path.abspath(top), + topdown=topdown, + onerror=onerror, + dvcignore=dvcignore, ): yield os.path.normpath(root), dirs, files diff --git a/dvc/state.py b/dvc/state.py index 67c097d551..211ed26832 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -32,10 +32,10 @@ def __init__(self, dvc_version, expected, actual): class StateBase(object): - def save(self, path_info, checksum, dvcignore_raises=True): + def save(self, path_info, checksum): pass - def get(self, path_info, dvcignore_raises=True): + def get(self, path_info): return None def save_link(self, path_info): @@ -342,7 +342,7 @@ def get_state_record_for_inode(self, inode): return results[0] return None - def save(self, path_info, checksum, dvcignore_raises=True): + def save(self, path_info, checksum): """Save checksum for the specified path info. Args: @@ -355,7 +355,9 @@ def save(self, path_info, checksum, dvcignore_raises=True): path = fspath_py35(path_info) assert os.path.exists(path) - actual_mtime, actual_size = get_mtime_and_size(path, dvcignore_raises) + actual_mtime, actual_size = get_mtime_and_size( + path, self.repo.dvcignore + ) actual_inode = get_inode(path) existing_record = self.get_state_record_for_inode(actual_inode) @@ -369,7 +371,7 @@ def save(self, path_info, checksum, dvcignore_raises=True): actual_inode, actual_mtime, actual_size, checksum ) - def get(self, path_info, dvcignore_raises=True): + def get(self, path_info): """Gets the checksum for the specified path info. Checksum will be retrieved from the state database if available. @@ -386,7 +388,9 @@ def get(self, path_info, dvcignore_raises=True): if not os.path.exists(path): return None - actual_mtime, actual_size = get_mtime_and_size(path, dvcignore_raises) + actual_mtime, actual_size = get_mtime_and_size( + path, self.repo.dvcignore + ) actual_inode = get_inode(path) existing_record = self.get_state_record_for_inode(actual_inode) @@ -413,7 +417,7 @@ def save_link(self, path_info): if not os.path.exists(path): return - mtime, _ = get_mtime_and_size(path) + mtime, _ = get_mtime_and_size(path, self.repo.dvcignore) inode = get_inode(path) relative_path = relpath(path, self.root_dir) @@ -443,7 +447,7 @@ def remove_unused_links(self, used): continue actual_inode = get_inode(path) - actual_mtime, _ = get_mtime_and_size(path) + actual_mtime, _ = get_mtime_and_size(path, self.repo.dvcignore) if inode == actual_inode and mtime == actual_mtime: logger.debug("Removing '{}' as unused link.".format(path)) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 778e091a82..c66b6d3ca6 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -273,38 +273,24 @@ def to_yaml_string(data): def dvc_walk( - top, - topdown=True, - onerror=None, - followlinks=False, - raise_on_dvcignore_below_top=False, + top, topdown=True, onerror=None, followlinks=False, dvcignore=None ): """ Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. """ - ignore_filter = None - if topdown: - from dvc.ignore import DvcIgnoreFilter - - ignore_filter = DvcIgnoreFilter( - top, raise_on_dvcignore_below_top=raise_on_dvcignore_below_top - ) - for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks ): - if ignore_filter: - dirs[:], files[:] = ignore_filter(root, dirs, files) + if dvcignore: + dirs[:], files[:] = dvcignore(root, dirs, files) yield root, dirs, files -def walk_files(directory, raise_on_dvcignore_below_top=False): - for root, _, files in dvc_walk( - directory, raise_on_dvcignore_below_top=raise_on_dvcignore_below_top - ): +def walk_files(directory, dvcignore=None): + for root, _, files in dvc_walk(directory, dvcignore=dvcignore): for f in files: yield os.path.join(root, f) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index f2e7a3e664..001c52328e 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -20,13 +20,11 @@ def get_inode(path): return inode -def get_mtime_and_size(path, dvcignore_raises=True): +def get_mtime_and_size(path, dvcignore=None): if os.path.isdir(path): size = 0 files_mtimes = {} - for file_path in walk_files( - path, raise_on_dvcignore_below_top=dvcignore_raises - ): + for file_path in walk_files(path, dvcignore): try: stat = os.stat(file_path) except OSError as exc: diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index cf7c2e0bb9..591402214a 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -1,10 +1,9 @@ import os import shutil -import pytest -from dvc.exceptions import DvcIgnoreError from dvc.ignore import DvcIgnore +from dvc.repo import Repo from dvc.utils.compat import cast_bytes from dvc.utils.fs import get_mtime_and_size from tests.basic_env import TestDvc @@ -18,7 +17,9 @@ def setUp(self): def _get_all_paths(self): paths = [] - for root, dirs, files in self.dvc.tree.walk(self.dvc.root_dir): + for root, dirs, files in self.dvc.tree.walk( + self.dvc.root_dir, dvcignore=self.dvc.dvcignore + ): for dname in dirs: paths.append(os.path.join(root, dname)) @@ -27,10 +28,14 @@ def _get_all_paths(self): return paths + def _reload_dvc(self): + self.dvc = Repo(self.dvc.root_dir) + def test_ignore_in_child_dir(self): ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE) with open(ignore_file, "w") as fobj: fobj.write("data_dir/data") + self._reload_dvc() forbidden_path = os.path.join(self.dvc.root_dir, self.DATA) all_paths = self._get_all_paths() @@ -41,6 +46,7 @@ def test_ignore_in_child_dir_unicode(self): ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE) with open(ignore_file, "wb") as fobj: fobj.write(cast_bytes(self.UNICODE, "utf-8")) + self._reload_dvc() forbidden_path = os.path.join(self.dvc.root_dir, self.UNICODE) all_paths = self._get_all_paths() @@ -51,6 +57,7 @@ def test_ignore_in_parent_dir(self): ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE) with open(ignore_file, "w") as fobj: fobj.write("data_dir/data") + self._reload_dvc() os.chdir(self.DATA_DIR) @@ -70,12 +77,15 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): [to_posixpath(repo_dir.DATA_SUB), to_posixpath(new_data_path)] ), ) + dvc_repo = Repo(dvc_repo.root_dir) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) + new_mtime_sig, new_size = get_mtime_and_size( + repo_dir.DATA_DIR, dvc_repo.dvcignore + ) assert new_mtime_sig == mtime_sig assert new_size == size @@ -95,48 +105,64 @@ def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR) + dvc_repo = Repo(dvc_repo.root_dir) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) os.remove(repo_dir.DATA_SUB) - new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) + new_mtime, new_size = get_mtime_and_size( + repo_dir.DATA_DIR, dvc_repo.dvcignore + ) - assert new_mtime_sig == mtime_sig + assert new_mtime == mtime assert new_size == size def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) os.remove(repo_dir.DATA_SUB) new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) - assert new_mtime_sig != mtime_sig + assert new_mtime_sig != mtime assert new_size != size -def test_ignore_should_raise_for_external_dependency(dvc_repo, repo_dir): +def test_should_ignore_for_external_dependency(dvc_repo, repo_dir): external_data_dir = repo_dir.mkdtemp() - data = os.path.join(external_data_dir, "data") + data = os.path.join(external_data_dir, "data_external") ignored_file = "data_ignored" - data_ignored = os.path.join(external_data_dir, ignored_file) - ignore_file = os.path.join(external_data_dir, DvcIgnore.DVCIGNORE_FILE) - repo_dir.create(data, "data") - repo_dir.create(data_ignored, "ignore this") - repo_dir.create(ignore_file, ignored_file) + ignored_full_path = os.path.join(external_data_dir, ignored_file) + ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) + repo_dir.create(data, "external_data_content") + repo_dir.create(ignored_full_path, "ignored_file_content") + repo_dir.create(ignore_file, ignored_full_path) + dvc_repo = Repo(dvc_repo.root_dir) - with pytest.raises(DvcIgnoreError): - dvc_repo.add(external_data_dir) + stages = dvc_repo.add(external_data_dir) + assert len(stages) == 1 + outs = stages[0].outs + assert len(outs) == 1 + + out_dir_cache = outs[0].dir_cache + assert len(out_dir_cache) == 1 + + assert out_dir_cache[0]["relpath"] == "data_external" + + +# TODO ignore tests rely heavily on reloading, maybe we should do something +# about that +# TODO problem: what if dvcignore is below out, someone loads repo, then adds +# dir? +def test_should_not_ignore_on_output_below_out_dir(dvc_repo, repo_dir): + dvc_repo.add(repo_dir.DATA_DIR) -def test_ignore_should_raise_on_ignore_file_below_output_dir( - dvc_repo, repo_dir -): ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) ignore_file_content = os.path.basename(repo_dir.DATA) repo_dir.create(ignore_file, ignore_file_content) + dvc_repo = Repo(dvc_repo.root_dir) - with pytest.raises(DvcIgnoreError): - dvc_repo.add(repo_dir.DATA_DIR) + assert dvc_repo.status() == {} def test_ignore_should_not_raise_on_ignore_in_dependency_dir( diff --git a/tests/func/test_repo.py b/tests/func/test_repo.py index abd8ed3805..fdf36bd2a3 100644 --- a/tests/func/test_repo.py +++ b/tests/func/test_repo.py @@ -1,4 +1,5 @@ from dvc.main import main +from dvc.repo import Repo from dvc.stage import Stage from tests.basic_env import TestDvcGit @@ -60,6 +61,8 @@ def test_should_not_gather_stage_files_from_ignored_d(self): with open(".dvcignore", "w") as fobj: fobj.write("data_dir") + self.dvc = Repo(self.dvc.root_dir) + stages = self.dvc.stages() self.assertEqual(2, len(stages)) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index c14bd3255d..a4db03d8ed 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -1,8 +1,7 @@ import os +import mock import pytest -from pathspec import PathSpec -from pathspec.patterns import GitWildMatchPattern from dvc.ignore import DvcIgnoreFromFile, DvcIgnoreDir, DvcIgnoreFile from mock import MagicMock @@ -10,10 +9,10 @@ def mock_dvcignore(dvcignore_path, patterns): tree_mock = MagicMock() - ignore_file = DvcIgnoreFromFile(dvcignore_path, tree_mock) - ignore_file.ignore_spec = PathSpec.from_lines( - GitWildMatchPattern, patterns - ) + with mock.patch.object( + tree_mock, "open", mock.mock_open(read_data="\n".join(patterns)) + ): + ignore_file = DvcIgnoreFromFile(dvcignore_path, tree_mock) return ignore_file @@ -58,7 +57,7 @@ def test_ignore_from_file_should_filter_dirs_and_files(): ["to_ignore"], True, ), - ("to_ignore.txt", ["/*.txt"], True), + ("to_ignore.txt", ["*.txt"], True), ( os.path.join("rel", "path", "path2", "to_ignore"), ["rel/*/to_ignore"], From 27141c183704e4744cb86c03389105123c269629 Mon Sep 17 00:00:00 2001 From: pared Date: Wed, 3 Jul 2019 17:43:49 +0200 Subject: [PATCH 08/14] ignore: work in progress --- dvc/ignore.py | 92 +++++++++++++++++++-------------------- dvc/remote/base.py | 1 + dvc/repo/__init__.py | 44 +++++++++++-------- dvc/scm/git/tree.py | 12 ++--- dvc/scm/tree.py | 3 +- dvc/stage.py | 2 - tests/func/test_add.py | 4 +- tests/func/test_ignore.py | 15 ++++--- tests/unit/test_ignore.py | 80 ++++++++++++++++------------------ 9 files changed, 128 insertions(+), 125 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index d2caa268ad..bf4002082b 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -5,7 +5,6 @@ from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern - logger = logging.getLogger(__name__) @@ -16,33 +15,39 @@ def __call__(self, root, dirs, files): raise NotImplementedError -class DvcIgnoreFromFile(DvcIgnore): - def __init__(self, ignore_file_path, tree): +def full_path_pattern(pattern, path): + # NOTE: '/' is translated to ^ for .gitignore style patterns, + # it is natural to proceed absolute path with this sign + if pattern.startswith("//"): + return pattern + + negation = False + + if pattern.startswith("!"): + pattern = pattern[1:] + negation = True + + if pattern.startswith("/"): + pattern = os.path.normpath(path) + pattern + else: + pattern = os.path.join(path, pattern) + + pattern = os.path.join(path, pattern) + pattern = "/" + pattern + + if negation: + pattern = "!" + pattern + + return pattern + + +class DvcIgnorePatterns(DvcIgnore): + def __init__(self, ignore_file_path, patterns): self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) - # TODO maybe we should readlines, join with ignore_file_path.dirname - # and then parse? - with tree.open(ignore_file_path) as fobj: - ignore_lines = fobj.readlines() - - def full_path_pattern(pattern): - negation = False - if not pattern.startswith("/"): - if pattern.startswith("!"): - pattern = pattern[1:] - negation = True - pattern = os.path.join(self.dirname, pattern) - # TODO NOTE: need to escape beggining of path - pattern = "/" + pattern - if negation: - pattern = "!" + pattern - return pattern - - ignore_lines = [full_path_pattern(p) for p in ignore_lines] - self.ignore_spec = PathSpec.from_lines( - GitWildMatchPattern, ignore_lines - ) + patterns = [full_path_pattern(p, self.dirname) for p in patterns] + self.spec = PathSpec.from_lines(GitWildMatchPattern, patterns) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -51,28 +56,30 @@ def __call__(self, root, dirs, files): return dirs, files def matches(self, dirname, basename): - abs_path = os.path.abspath(os.path.join(dirname, basename)) - # relative_path = relpath(abs_path, self.dirname) - result = self.ignore_spec.match_file(abs_path) + abs_path = os.path.join(dirname, basename) + + result = self.spec.match_file(abs_path) + return result def __hash__(self): return hash(self.ignore_file_path) -class DvcIgnoreConstant(DvcIgnore): - def __init__(self, basename): - self.basename = basename - +class DvcIgnoreDirs(DvcIgnore): + def __init__(self, basenames): + self.basenames = set(basenames) -class DvcIgnoreDir(DvcIgnoreConstant): def __call__(self, root, dirs, files): - dirs = [d for d in dirs if not d == self.basename] + dirs = [d for d in dirs if d not in self.basenames] return dirs, files -class DvcIgnoreFile(DvcIgnoreConstant): +class DvcIgnoreFile(DvcIgnore): + def __init__(self, basename): + self.basename = basename + def __call__(self, root, dirs, files): files = [f for f in files if not f == self.basename] @@ -80,21 +87,14 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self, tree): + def __init__(self): self.ignores = [ - DvcIgnoreDir(".git"), - DvcIgnoreDir(".hg"), - DvcIgnoreDir(".dvc"), + DvcIgnoreDirs([".git", ".hg", ".dvc"]), DvcIgnoreFile(".dvcignore"), ] - self.tree = tree - - def update(self, wdir): - ignore_file_path = os.path.join(wdir, DvcIgnore.DVCIGNORE_FILE) - if self.tree.exists(ignore_file_path): - file_ignore = DvcIgnoreFromFile(ignore_file_path, tree=self.tree) - self.ignores.append(file_ignore) + def update(self, ignore_file_path, patterns): + self.ignores.append(DvcIgnorePatterns(ignore_file_path, patterns)) def __call__(self, root, dirs, files): for ignore in self.ignores: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d89d37c509..df4513dcb5 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -149,6 +149,7 @@ def _collect_dir(self, path_info): root_info = path_info / root + for fname in files: file_info = root_info / fname relative_path = file_info.relative_to(path_info) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 8817220a61..bfb848d648 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -12,7 +12,7 @@ TargetNotDirectoryError, OutputFileMissingError, ) -from dvc.ignore import DvcIgnoreFilter +from dvc.ignore import DvcIgnoreFilter, DvcIgnore from dvc.path_info import PathInfo from dvc.utils.compat import open as _open, fspath_py35 from dvc.utils import relpath @@ -369,32 +369,44 @@ def filter_dirs(dname): return filter_dirs + def update_dvcignore(self, dvcignore, dir_path): + ignore_file_path = os.path.join(dir_path, DvcIgnore.DVCIGNORE_FILE) + if os.path.exists(ignore_file_path): + with self.tree.open(ignore_file_path) as fobj: + ignore_patterns = fobj.readlines() + dvcignore.update(ignore_file_path, ignore_patterns) + def load_dvcignores(self): from dvc.stage import Stage outs = [] # TODO maybe dvcignore should be a part of WorkingTree? - dvcignore = DvcIgnoreFilter(self.tree) + dvcignore = DvcIgnoreFilter() for root, dirs, files in self.tree.walk(self.root_dir): for fname in files: path = os.path.join(root, fname) if not Stage.is_valid_filename(path): continue - try: - stage = Stage.load(self, path) - - for out in stage.outs: - if out.scheme == "local": - outs.append(out.fspath + out.sep) + _, outs = self.load_stage_with_outs_paths(path) + outs.extend(outs) # TODO - except Exception: - pass out_dir_filter = self.make_out_dir_filter(outs, root) dirs[:] = list(filter(out_dir_filter, dirs)) - dvcignore.update(root) + self.update_dvcignore(dvcignore, root) return dvcignore + # TODO I don't like it + def load_stage_with_outs_paths(self, stage_path): + from dvc.stage import Stage + + outs = [] + stage = Stage.load(self, stage_path) + for out in stage.outs: + if out.scheme == "local": + outs.append(out.fspath + out.sep) + return stage, outs + def stages(self, from_directory=None, check_dag=True): """ Walks down the root directory looking for Dvcfiles, @@ -413,7 +425,7 @@ def stages(self, from_directory=None, check_dag=True): raise TargetNotDirectoryError(from_directory) stages = [] - outs = [] + all_outs = [] for root, dirs, files in self.tree.walk( from_directory, dvcignore=self.dvcignore @@ -422,13 +434,11 @@ def stages(self, from_directory=None, check_dag=True): path = os.path.join(root, fname) if not Stage.is_valid_filename(path): continue - stage = Stage.load(self, path) - for out in stage.outs: - if out.scheme == "local": - outs.append(out.fspath + out.sep) + stage, outs = self.load_stage_with_outs_paths(path) stages.append(stage) + all_outs.extend(outs) - out_dir_filter = self.make_out_dir_filter(outs, root) + out_dir_filter = self.make_out_dir_filter(all_outs, root) dirs[:] = list(filter(out_dir_filter, dirs)) if check_dag: diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py index ef8d7a8841..39c1dbf3b1 100644 --- a/dvc/scm/git/tree.py +++ b/dvc/scm/git/tree.py @@ -102,7 +102,7 @@ def git_object_by_path(self, path): tree = tree[i] return tree - def _walk(self, tree, topdown=True, dvc_ignore_filter=None): + def _walk(self, tree, topdown=True): # TODO how to handle dvcignore here? dirs, nondirs = [], [] for i in tree: @@ -112,23 +112,17 @@ def _walk(self, tree, topdown=True, dvc_ignore_filter=None): nondirs.append(i.name) if topdown: - if not dvc_ignore_filter: - # dvc_ignore_filter = DvcIgnoreFilter(tree.abspath) - pass - if dvc_ignore_filter: - dirs, nondirs = dvc_ignore_filter(tree.path, dirs, nondirs) yield os.path.normpath(tree.abspath), dirs, nondirs for i in dirs: - for x in self._walk( - tree[i], topdown=True, dvc_ignore_filter=dvc_ignore_filter - ): + for x in self._walk(tree[i], topdown=True): yield x if not topdown: yield os.path.normpath(tree.abspath), dirs, nondirs def walk(self, top, topdown=True, dvcignore=None): + # TODO should we ignore dvcignore for GitTree? """Directory tree generator. See `os.walk` for the docs. Differences: diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index 9d18b5e738..0a67196119 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -71,7 +71,8 @@ def walk(self, top, topdown=True, dvcignore=None): if not dvcignore: from dvc.ignore import DvcIgnoreFilter - dvcignore = DvcIgnoreFilter(self) + # TODO should it be here? + dvcignore = DvcIgnoreFilter() def onerror(e): raise e diff --git a/dvc/stage.py b/dvc/stage.py index 376bfd37a9..9ca9256e63 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -401,7 +401,6 @@ def is_cached(self): # NOTE: need to save checksums for deps in order to compare them # with what is written in the old stage. - # TODO this should use dvcignore for dep in self.deps: dep.save() @@ -707,7 +706,6 @@ def _compute_md5(self): return m def save(self): - # TODO should we use dvcignore here? for dep in self.deps: dep.save() diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 2c1aa29273..c5012dab44 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -431,7 +431,7 @@ def test(self): self._caplog.clear() ret = main(["add", self.BAR]) - assert 1 == ret + assert 255 == ret expected_error = ( "unable to read DVC-file: {} " @@ -481,7 +481,7 @@ def test(self): file.write("this will break yaml file structure") ret = main(["add", self.BAR]) - self.assertEqual(1, ret) + self.assertEqual(255, ret) bar_stage_file = self.BAR + Stage.STAGE_FILE_SUFFIX self.assertFalse(os.path.exists(bar_stage_file)) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 591402214a..8de0376e76 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -79,12 +79,14 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): ) dvc_repo = Repo(dvc_repo.root_dir) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) + mtime_sig, size = get_mtime_and_size( + os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore + ) shutil.move(repo_dir.DATA_SUB, new_data_path) new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.dvcignore + os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore ) assert new_mtime_sig == mtime_sig @@ -106,11 +108,14 @@ def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) dvc_repo = Repo(dvc_repo.root_dir) - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) + mtime, size = get_mtime_and_size( + os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore + ) os.remove(repo_dir.DATA_SUB) + # TODO abspath new_mtime, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.dvcignore + os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore ) assert new_mtime == mtime @@ -135,7 +140,7 @@ def test_should_ignore_for_external_dependency(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(data, "external_data_content") repo_dir.create(ignored_full_path, "ignored_file_content") - repo_dir.create(ignore_file, ignored_full_path) + repo_dir.create(ignore_file, "/" + ignored_full_path) dvc_repo = Repo(dvc_repo.root_dir) stages = dvc_repo.add(external_data_dir) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index a4db03d8ed..00f5835a3e 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -1,18 +1,12 @@ import os -import mock import pytest -from dvc.ignore import DvcIgnoreFromFile, DvcIgnoreDir, DvcIgnoreFile -from mock import MagicMock +from dvc.ignore import DvcIgnorePatterns, DvcIgnoreDirs, DvcIgnoreFile def mock_dvcignore(dvcignore_path, patterns): - tree_mock = MagicMock() - with mock.patch.object( - tree_mock, "open", mock.mock_open(read_data="\n".join(patterns)) - ): - ignore_file = DvcIgnoreFromFile(dvcignore_path, tree_mock) + ignore_file = DvcIgnorePatterns(dvcignore_path, patterns) return ignore_file @@ -37,40 +31,40 @@ def test_ignore_from_file_should_filter_dirs_and_files(): @pytest.mark.parametrize( "file_to_ignore_relpath, patterns, expected_match", [ - ("to_ignore", ["to_ignore"], True), - ("to_ignore.txt", ["to_ignore*"], True), - ( - os.path.join("rel", "p", "p2", "to_ignore"), - ["rel/**/to_ignore"], - True, - ), - ( - os.path.join( - os.path.sep, - "full", - "path", - "to", - "ignore", - "file", - "to_ignore", - ), - ["to_ignore"], - True, - ), - ("to_ignore.txt", ["*.txt"], True), - ( - os.path.join("rel", "path", "path2", "to_ignore"), - ["rel/*/to_ignore"], - False, - ), - (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), - ( - os.path.join("rel", "path", "path2", "dont_ignore"), - ["rel/**/to_ignore"], - False, - ), - ("dont_ignore.txt", ["dont_ignore"], False), - ("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False), + # ("to_ignore", ["to_ignore"], True), + # ("to_ignore.txt", ["to_ignore*"], True), + # ( + # os.path.join("rel", "p", "p2", "to_ignore"), + # ["rel/**/to_ignore"], + # True, + # ), + # ( + # os.path.join( + # os.path.sep, + # "full", + # "path", + # "to", + # "ignore", + # "file", + # "to_ignore", + # ), + # ["to_ignore"], + # True, + # ), + ("to_ignore.txt", ["/*.txt"], True), + # ( + # os.path.join("rel", "path", "path2", "to_ignore"), + # ["rel/*/to_ignore"], + # False, + # ), + # (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), + # ( + # os.path.join("rel", "path", "path2", "dont_ignore"), + # ["rel/**/to_ignore"], + # False, + # ), + # ("dont_ignore.txt", ["dont_ignore"], False), + # ("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False), ], ) def test_match_ignore_from_file( @@ -92,7 +86,7 @@ def test_match_ignore_from_file( @pytest.mark.parametrize("omit_dir", [".git", ".hg", ".dvc"]) def test_should_ignore_dir(omit_dir): - ignore = DvcIgnoreDir(omit_dir) + ignore = DvcIgnoreDirs([".git", ".hg", ".dvc"]) root = os.path.join(os.path.sep, "walk", "dir", "root") dirs = [omit_dir, "dir1", "dir2"] From d922f08cb544000ef72e54386d876aea68fbb3ef Mon Sep 17 00:00:00 2001 From: pared Date: Thu, 4 Jul 2019 11:58:39 +0200 Subject: [PATCH 09/14] ignore: work in progress 2 --- dvc/ignore.py | 70 +++++++++++++++++++++++---------------- tests/func/test_ignore.py | 33 ++++++++++-------- tests/unit/test_ignore.py | 67 +++++++++++++++++++------------------ 3 files changed, 95 insertions(+), 75 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index bf4002082b..2ccdd446eb 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -5,6 +5,8 @@ from pathspec import PathSpec from pathspec.patterns import GitWildMatchPattern +from dvc.utils import relpath + logger = logging.getLogger(__name__) @@ -15,30 +17,30 @@ def __call__(self, root, dirs, files): raise NotImplementedError -def full_path_pattern(pattern, path): - # NOTE: '/' is translated to ^ for .gitignore style patterns, - # it is natural to proceed absolute path with this sign - if pattern.startswith("//"): - return pattern - - negation = False - - if pattern.startswith("!"): - pattern = pattern[1:] - negation = True - - if pattern.startswith("/"): - pattern = os.path.normpath(path) + pattern - else: - pattern = os.path.join(path, pattern) - - pattern = os.path.join(path, pattern) - pattern = "/" + pattern - - if negation: - pattern = "!" + pattern - - return pattern +# def full_path_pattern(pattern, path): +# # NOTE: '/' is translated to ^ for .gitignore style patterns, +# # it is natural to proceed absolute path with this sign +# if pattern.startswith("//"): +# return pattern +# +# negation = False +# +# if pattern.startswith("!"): +# pattern = pattern[1:] +# negation = True +# +# if pattern.startswith("/"): +# pattern = os.path.normpath(path) + pattern +# else: +# pattern = os.path.join(path, pattern) +# +# pattern = os.path.join(path, pattern) +# pattern = "/" + pattern +# +# if negation: +# pattern = "!" + pattern +# +# return pattern class DvcIgnorePatterns(DvcIgnore): @@ -46,8 +48,17 @@ def __init__(self, ignore_file_path, patterns): self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) - patterns = [full_path_pattern(p, self.dirname) for p in patterns] - self.spec = PathSpec.from_lines(GitWildMatchPattern, patterns) + abs_patterns = [] + rel_patterns = [] + for p in patterns: + # NOTE: '/' is translated to ^ for .gitignore style patterns, + # it is natural to proceed absolute path with this sign + if p[0] == "/" and os.path.isabs(p[1:]): + abs_patterns.append(p) + else: + rel_patterns.append(p) + self.abs_spec = PathSpec.from_lines(GitWildMatchPattern, abs_patterns) + self.rel_spec = PathSpec.from_lines(GitWildMatchPattern, rel_patterns) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -57,10 +68,11 @@ def __call__(self, root, dirs, files): def matches(self, dirname, basename): abs_path = os.path.join(dirname, basename) + rel_path = relpath(abs_path, self.dirname) - result = self.spec.match_file(abs_path) - - return result + if os.pardir + os.sep in rel_path or os.path.isabs(rel_path): + return self.abs_spec.match_file(abs_path) + return self.rel_spec.match_file(rel_path) def __hash__(self): return hash(self.ignore_file_path) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 8de0376e76..dbf78acd19 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -79,14 +79,12 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): ) dvc_repo = Repo(dvc_repo.root_dir) - mtime_sig, size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore - ) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) shutil.move(repo_dir.DATA_SUB, new_data_path) new_mtime_sig, new_size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore + repo_dir.DATA_DIR, dvc_repo.dvcignore ) assert new_mtime_sig == mtime_sig @@ -108,14 +106,11 @@ def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) dvc_repo = Repo(dvc_repo.root_dir) - mtime, size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore - ) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) os.remove(repo_dir.DATA_SUB) - # TODO abspath new_mtime, new_size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvc_repo.dvcignore + repo_dir.DATA_DIR, dvc_repo.dvcignore ) assert new_mtime == mtime @@ -155,10 +150,22 @@ def test_should_ignore_for_external_dependency(dvc_repo, repo_dir): assert out_dir_cache[0]["relpath"] == "data_external" -# TODO ignore tests rely heavily on reloading, maybe we should do something -# about that -# TODO problem: what if dvcignore is below out, someone loads repo, then adds -# dir? +def test_should_not_consider_dvcignore_after_adding(dvc_repo, repo_dir): + ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) + ignore_file_content = os.path.basename(repo_dir.DATA) + + repo_dir.create(ignore_file, ignore_file_content) + dvc_repo = Repo(dvc_repo.root_dir) + + stages = dvc_repo.add(repo_dir.DATA_DIR) + assert len(stages) == 1 + assert len(stages[0].outs) == 1 + + dir_cache = stages[0].outs[0].dir_cache + relpaths = set(map(lambda x: x["relpath"], dir_cache)) + assert repo_dir.DATA in relpaths + + def test_should_not_ignore_on_output_below_out_dir(dvc_repo, repo_dir): dvc_repo.add(repo_dir.DATA_DIR) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 00f5835a3e..0bdb07e028 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -31,40 +31,41 @@ def test_ignore_from_file_should_filter_dirs_and_files(): @pytest.mark.parametrize( "file_to_ignore_relpath, patterns, expected_match", [ - # ("to_ignore", ["to_ignore"], True), - # ("to_ignore.txt", ["to_ignore*"], True), - # ( - # os.path.join("rel", "p", "p2", "to_ignore"), - # ["rel/**/to_ignore"], - # True, - # ), - # ( - # os.path.join( - # os.path.sep, - # "full", - # "path", - # "to", - # "ignore", - # "file", - # "to_ignore", - # ), - # ["to_ignore"], - # True, - # ), + ("to_ignore", ["to_ignore"], True), + ("to_ignore.txt", ["to_ignore*"], True), + ( + os.path.join("rel", "p", "p2", "to_ignore"), + ["rel/**/to_ignore"], + True, + ), + ( + os.path.join( + os.path.sep, + "full", + "path", + "to", + "ignore", + "file", + "to_ignore", + ), + ["to_ignore"], + True, + ), ("to_ignore.txt", ["/*.txt"], True), - # ( - # os.path.join("rel", "path", "path2", "to_ignore"), - # ["rel/*/to_ignore"], - # False, - # ), - # (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), - # ( - # os.path.join("rel", "path", "path2", "dont_ignore"), - # ["rel/**/to_ignore"], - # False, - # ), - # ("dont_ignore.txt", ["dont_ignore"], False), - # ("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False), + ( + os.path.join("rel", "path", "path2", "to_ignore"), + ["rel/*/to_ignore"], + False, + ), + (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False), + ( + os.path.join("rel", "path", "path2", "dont_ignore"), + ["rel/**/to_ignore"], + False, + ), + ("dont_ignore.txt", ["dont_ignore"], False), + ("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False), + ("../../../something.txt", ["**/something.txt"], False), ], ) def test_match_ignore_from_file( From 5797fcabd390326cab5a8ef04b071fbc1f510fc8 Mon Sep 17 00:00:00 2001 From: pared Date: Thu, 4 Jul 2019 12:13:03 +0200 Subject: [PATCH 10/14] ignore: throw on .dvcignore upon collecting dir --- dvc/exceptions.py | 8 ++++++++ dvc/ignore.py | 15 +-------------- dvc/remote/base.py | 15 +++++++++++---- tests/func/test_ignore.py | 37 ++++++------------------------------- tests/unit/test_ignore.py | 15 +-------------- 5 files changed, 27 insertions(+), 63 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 6b38af4ead..b47bbee7bc 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -270,3 +270,11 @@ def __init__(self, path): super(OutputFileMissingError, self).__init__( "Can't find {} neither locally nor on remote".format(path) ) + + +class DvcIgnoreInCollectedDirError(DvcException): + def __init(self, ignore_dirname): + super(DvcIgnoreInCollectedDirError, self).__init__( + ".dvcignore file should not be in collected dir path: '{" + "}'".format(ignore_dirname) + ) diff --git a/dvc/ignore.py b/dvc/ignore.py index 2ccdd446eb..f02ee4e7f6 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -88,22 +88,9 @@ def __call__(self, root, dirs, files): return dirs, files -class DvcIgnoreFile(DvcIgnore): - def __init__(self, basename): - self.basename = basename - - def __call__(self, root, dirs, files): - files = [f for f in files if not f == self.basename] - - return dirs, files - - class DvcIgnoreFilter(object): def __init__(self): - self.ignores = [ - DvcIgnoreDirs([".git", ".hg", ".dvc"]), - DvcIgnoreFile(".dvcignore"), - ] + self.ignores = [DvcIgnoreDirs([".git", ".hg", ".dvc"])] def update(self, ignore_file_path, patterns): self.ignores.append(DvcIgnorePatterns(ignore_file_path, patterns)) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index df4513dcb5..d078e4b09f 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +from dvc.ignore import DvcIgnore from dvc.utils.compat import str, basestring, urlparse, fspath_py35, makedirs import os @@ -14,7 +15,11 @@ import dvc.prompt as prompt from dvc.config import Config -from dvc.exceptions import DvcException, ConfirmRemoveError +from dvc.exceptions import ( + DvcException, + ConfirmRemoveError, + DvcIgnoreInCollectedDirError, +) from dvc.progress import progress, ProgressCallback from dvc.utils import LARGE_DIR_SIZE, tmp_fname, to_chunks, move, relpath from dvc.state import StateBase @@ -151,9 +156,11 @@ def _collect_dir(self, path_info): for fname in files: - file_info = root_info / fname - relative_path = file_info.relative_to(path_info) - checksum = executor.submit( + if fname == DvcIgnore.DVCIGNORE_FILE: + raise DvcIgnoreInCollectedDirError(root) + file_info = root_info / fname + relative_path = file_info.relative_to(path_info) + checksum = executor.submit( self.get_file_checksum, file_info ) dir_info[checksum] = { diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index dbf78acd19..cbf59f96ad 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -1,7 +1,9 @@ import os import shutil +import pytest +from dvc.exceptions import DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore from dvc.repo import Repo from dvc.utils.compat import cast_bytes @@ -91,7 +93,7 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): assert new_size == size -def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): +def test_mtime_changed_when_moving_non_ignored_file(repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) @@ -117,7 +119,7 @@ def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): assert new_size == size -def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): +def test_metadata_changed_on_non_ignored_file_deletion(repo_dir): mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) os.remove(repo_dir.DATA_SUB) @@ -157,32 +159,5 @@ def test_should_not_consider_dvcignore_after_adding(dvc_repo, repo_dir): repo_dir.create(ignore_file, ignore_file_content) dvc_repo = Repo(dvc_repo.root_dir) - stages = dvc_repo.add(repo_dir.DATA_DIR) - assert len(stages) == 1 - assert len(stages[0].outs) == 1 - - dir_cache = stages[0].outs[0].dir_cache - relpaths = set(map(lambda x: x["relpath"], dir_cache)) - assert repo_dir.DATA in relpaths - - -def test_should_not_ignore_on_output_below_out_dir(dvc_repo, repo_dir): - dvc_repo.add(repo_dir.DATA_DIR) - - ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) - ignore_file_content = os.path.basename(repo_dir.DATA) - repo_dir.create(ignore_file, ignore_file_content) - dvc_repo = Repo(dvc_repo.root_dir) - - assert dvc_repo.status() == {} - - -def test_ignore_should_not_raise_on_ignore_in_dependency_dir( - dvc_repo, repo_dir -): - ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) - ignore_file_content = os.path.basename(repo_dir.DATA) - repo_dir.create(ignore_file, ignore_file_content) - - stage_file = "stage.dvc" - dvc_repo.run(fname=stage_file, deps=[repo_dir.DATA_DIR]) + with pytest.raises(DvcIgnoreInCollectedDirError): + dvc_repo.add(repo_dir.DATA_DIR) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 0bdb07e028..dd475b4239 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -2,7 +2,7 @@ import pytest -from dvc.ignore import DvcIgnorePatterns, DvcIgnoreDirs, DvcIgnoreFile +from dvc.ignore import DvcIgnorePatterns, DvcIgnoreDirs def mock_dvcignore(dvcignore_path, patterns): @@ -96,16 +96,3 @@ def test_should_ignore_dir(omit_dir): new_dirs, _ = ignore(root, dirs, files) assert set(new_dirs) == {"dir1", "dir2"} - - -def test_should_ignore_file(): - dvcignore = ".dvcignore" - ignore = DvcIgnoreFile(dvcignore) - - root = os.path.join(os.path.sep, "walk", "dir", "root") - dirs = [] - files = ["file1", "file2", dvcignore] - - _, new_files = ignore(root, dirs, files) - - assert set(new_files) == {"file1", "file2"} From 934477c319cc16222308343e2c6fc81cfe02c02b Mon Sep 17 00:00:00 2001 From: pawel Date: Fri, 5 Jul 2019 12:29:09 +0200 Subject: [PATCH 11/14] ignore: refactor --- dvc/ignore.py | 41 ++++++++++++++------------ dvc/remote/base.py | 15 +++++----- dvc/repo/__init__.py | 52 +++++--------------------------- dvc/scm/git/tree.py | 2 -- dvc/scm/tree.py | 2 +- dvc/utils/__init__.py | 3 ++ tests/func/test_add.py | 4 +-- tests/func/test_ignore.py | 62 +++++++++++++-------------------------- tests/unit/test_ignore.py | 9 +++++- 9 files changed, 72 insertions(+), 118 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index f02ee4e7f6..53c8074541 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -6,6 +6,9 @@ from pathspec.patterns import GitWildMatchPattern from dvc.utils import relpath +from dvc.utils.fs import get_parent_dirs_up_to + +from dvc.utils.compat import open logger = logging.getLogger(__name__) @@ -44,21 +47,12 @@ def __call__(self, root, dirs, files): class DvcIgnorePatterns(DvcIgnore): - def __init__(self, ignore_file_path, patterns): + def __init__(self, ignore_file_path): self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) - abs_patterns = [] - rel_patterns = [] - for p in patterns: - # NOTE: '/' is translated to ^ for .gitignore style patterns, - # it is natural to proceed absolute path with this sign - if p[0] == "/" and os.path.isabs(p[1:]): - abs_patterns.append(p) - else: - rel_patterns.append(p) - self.abs_spec = PathSpec.from_lines(GitWildMatchPattern, abs_patterns) - self.rel_spec = PathSpec.from_lines(GitWildMatchPattern, rel_patterns) + with open(ignore_file_path, encoding="utf-8") as fobj: + self.rel_spec = PathSpec.from_lines(GitWildMatchPattern, fobj) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -70,8 +64,8 @@ def matches(self, dirname, basename): abs_path = os.path.join(dirname, basename) rel_path = relpath(abs_path, self.dirname) - if os.pardir + os.sep in rel_path or os.path.isabs(rel_path): - return self.abs_spec.match_file(abs_path) + if os.pardir + os.sep in rel_path: + return False return self.rel_spec.match_file(rel_path) def __hash__(self): @@ -89,11 +83,22 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self): + def __init__(self, root=None): self.ignores = [DvcIgnoreDirs([".git", ".hg", ".dvc"])] - - def update(self, ignore_file_path, patterns): - self.ignores.append(DvcIgnorePatterns(ignore_file_path, patterns)) + self.root = root + + def load_upper_levels(self, top): + top = os.path.abspath(top) + if self.root: + parent_dirs = get_parent_dirs_up_to(top, self.root) + parent_dirs.append(top) + for d in parent_dirs: + self.update(d) + + def update(self, dirname): + ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) + if os.path.exists(ignore_file_path): + self.ignores.append(DvcIgnorePatterns(ignore_file_path)) def __call__(self, root, dirs, files): for ignore in self.ignores: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d078e4b09f..755274eeeb 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -151,16 +151,15 @@ def _collect_dir(self, path_info): with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor: for root, _dirs, files in self.walk(path_info): - root_info = path_info / root - - - + root_info = path_info / root for fname in files: + if fname == DvcIgnore.DVCIGNORE_FILE: - raise DvcIgnoreInCollectedDirError(root) - file_info = root_info / fname - relative_path = file_info.relative_to(path_info) - checksum = executor.submit( + raise DvcIgnoreInCollectedDirError(root) + + file_info = root_info / fname + relative_path = file_info.relative_to(path_info) + checksum = executor.submit( self.get_file_checksum, file_info ) dir_info[checksum] = { diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index bfb848d648..d26c18b281 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -12,7 +12,7 @@ TargetNotDirectoryError, OutputFileMissingError, ) -from dvc.ignore import DvcIgnoreFilter, DvcIgnore +from dvc.ignore import DvcIgnoreFilter from dvc.path_info import PathInfo from dvc.utils.compat import open as _open, fspath_py35 from dvc.utils import relpath @@ -66,6 +66,7 @@ def __init__(self, root_dir=None): self.scm = SCM(self.root_dir, repo=self) self.tree = WorkingTree(self.root_dir) + self.dvcignore = DvcIgnoreFilter(self.root_dir) self.lock = Lock(self.dvc_dir) # NOTE: storing state and link_state in the repository itself to avoid @@ -85,7 +86,6 @@ def __init__(self, root_dir=None): self.tag = Tag(self) self._ignore() - self.dvcignore = self.load_dvcignores() def __repr__(self): return "Repo: '{root_dir}'".format(root_dir=self.root_dir) @@ -369,44 +369,6 @@ def filter_dirs(dname): return filter_dirs - def update_dvcignore(self, dvcignore, dir_path): - ignore_file_path = os.path.join(dir_path, DvcIgnore.DVCIGNORE_FILE) - if os.path.exists(ignore_file_path): - with self.tree.open(ignore_file_path) as fobj: - ignore_patterns = fobj.readlines() - dvcignore.update(ignore_file_path, ignore_patterns) - - def load_dvcignores(self): - from dvc.stage import Stage - - outs = [] - # TODO maybe dvcignore should be a part of WorkingTree? - dvcignore = DvcIgnoreFilter() - for root, dirs, files in self.tree.walk(self.root_dir): - for fname in files: - path = os.path.join(root, fname) - if not Stage.is_valid_filename(path): - continue - _, outs = self.load_stage_with_outs_paths(path) - outs.extend(outs) - # TODO - out_dir_filter = self.make_out_dir_filter(outs, root) - dirs[:] = list(filter(out_dir_filter, dirs)) - self.update_dvcignore(dvcignore, root) - - return dvcignore - - # TODO I don't like it - def load_stage_with_outs_paths(self, stage_path): - from dvc.stage import Stage - - outs = [] - stage = Stage.load(self, stage_path) - for out in stage.outs: - if out.scheme == "local": - outs.append(out.fspath + out.sep) - return stage, outs - def stages(self, from_directory=None, check_dag=True): """ Walks down the root directory looking for Dvcfiles, @@ -425,7 +387,7 @@ def stages(self, from_directory=None, check_dag=True): raise TargetNotDirectoryError(from_directory) stages = [] - all_outs = [] + outs = [] for root, dirs, files in self.tree.walk( from_directory, dvcignore=self.dvcignore @@ -434,11 +396,13 @@ def stages(self, from_directory=None, check_dag=True): path = os.path.join(root, fname) if not Stage.is_valid_filename(path): continue - stage, outs = self.load_stage_with_outs_paths(path) + stage = Stage.load(self, path) + for out in stage.outs: + if out.scheme == "local": + outs.append(out.fspath + out.sep) stages.append(stage) - all_outs.extend(outs) - out_dir_filter = self.make_out_dir_filter(all_outs, root) + out_dir_filter = self.make_out_dir_filter(outs, root) dirs[:] = list(filter(out_dir_filter, dirs)) if check_dag: diff --git a/dvc/scm/git/tree.py b/dvc/scm/git/tree.py index 39c1dbf3b1..5716984d2e 100644 --- a/dvc/scm/git/tree.py +++ b/dvc/scm/git/tree.py @@ -103,7 +103,6 @@ def git_object_by_path(self, path): return tree def _walk(self, tree, topdown=True): - # TODO how to handle dvcignore here? dirs, nondirs = [], [] for i in tree: if i.mode == GIT_MODE_DIR: @@ -122,7 +121,6 @@ def _walk(self, tree, topdown=True): yield os.path.normpath(tree.abspath), dirs, nondirs def walk(self, top, topdown=True, dvcignore=None): - # TODO should we ignore dvcignore for GitTree? """Directory tree generator. See `os.walk` for the docs. Differences: diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index 0a67196119..090e8746a3 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -71,8 +71,8 @@ def walk(self, top, topdown=True, dvcignore=None): if not dvcignore: from dvc.ignore import DvcIgnoreFilter - # TODO should it be here? dvcignore = DvcIgnoreFilter() + dvcignore.load_upper_levels(top) def onerror(e): raise e diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index c66b6d3ca6..824689049f 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -279,11 +279,14 @@ def dvc_walk( Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. """ + if dvcignore: + dvcignore.load_upper_levels(top) for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks ): if dvcignore: + dvcignore.update(root) dirs[:], files[:] = dvcignore(root, dirs, files) yield root, dirs, files diff --git a/tests/func/test_add.py b/tests/func/test_add.py index c5012dab44..2c1aa29273 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -431,7 +431,7 @@ def test(self): self._caplog.clear() ret = main(["add", self.BAR]) - assert 255 == ret + assert 1 == ret expected_error = ( "unable to read DVC-file: {} " @@ -481,7 +481,7 @@ def test(self): file.write("this will break yaml file structure") ret = main(["add", self.BAR]) - self.assertEqual(255, ret) + self.assertEqual(1, ret) bar_stage_file = self.BAR + Stage.STAGE_FILE_SUFFIX self.assertFalse(os.path.exists(bar_stage_file)) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index cbf59f96ad..1da50fb279 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -4,8 +4,7 @@ import pytest from dvc.exceptions import DvcIgnoreInCollectedDirError -from dvc.ignore import DvcIgnore -from dvc.repo import Repo +from dvc.ignore import DvcIgnore, DvcIgnoreFilter from dvc.utils.compat import cast_bytes from dvc.utils.fs import get_mtime_and_size from tests.basic_env import TestDvc @@ -30,14 +29,10 @@ def _get_all_paths(self): return paths - def _reload_dvc(self): - self.dvc = Repo(self.dvc.root_dir) - def test_ignore_in_child_dir(self): ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE) with open(ignore_file, "w") as fobj: fobj.write("data_dir/data") - self._reload_dvc() forbidden_path = os.path.join(self.dvc.root_dir, self.DATA) all_paths = self._get_all_paths() @@ -48,7 +43,6 @@ def test_ignore_in_child_dir_unicode(self): ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE) with open(ignore_file, "wb") as fobj: fobj.write(cast_bytes(self.UNICODE, "utf-8")) - self._reload_dvc() forbidden_path = os.path.join(self.dvc.root_dir, self.UNICODE) all_paths = self._get_all_paths() @@ -59,7 +53,6 @@ def test_ignore_in_parent_dir(self): ignore_file = os.path.join(self.dvc.root_dir, DvcIgnore.DVCIGNORE_FILE) with open(ignore_file, "w") as fobj: fobj.write("data_dir/data") - self._reload_dvc() os.chdir(self.DATA_DIR) @@ -71,6 +64,7 @@ def test_ignore_in_parent_dir(self): def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" + dvcignore = DvcIgnoreFilter(root=dvc_repo.root_dir) ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create( @@ -79,14 +73,15 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): [to_posixpath(repo_dir.DATA_SUB), to_posixpath(new_data_path)] ), ) - dvc_repo = Repo(dvc_repo.root_dir) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) + mtime_sig, size = get_mtime_and_size( + os.path.abspath(repo_dir.DATA_DIR), dvcignore + ) shutil.move(repo_dir.DATA_SUB, new_data_path) new_mtime_sig, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.dvcignore + os.path.abspath(repo_dir.DATA_DIR), dvcignore ) assert new_mtime_sig == mtime_sig @@ -95,10 +90,12 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): def test_mtime_changed_when_moving_non_ignored_file(repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) + mtime, size = get_mtime_and_size(os.path.abspath(repo_dir.DATA_DIR)) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR) + new_mtime, new_size = get_mtime_and_size( + os.path.abspath(repo_dir.DATA_DIR) + ) assert new_mtime != mtime assert new_size == size @@ -107,12 +104,15 @@ def test_mtime_changed_when_moving_non_ignored_file(repo_dir): def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) - dvc_repo = Repo(dvc_repo.root_dir) - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) + dvcignore = DvcIgnoreFilter(root=dvc_repo.root_dir) + + mtime, size = get_mtime_and_size( + os.path.abspath(repo_dir.DATA_DIR), dvcignore + ) os.remove(repo_dir.DATA_SUB) new_mtime, new_size = get_mtime_and_size( - repo_dir.DATA_DIR, dvc_repo.dvcignore + os.path.abspath(repo_dir.DATA_DIR), dvcignore ) assert new_mtime == mtime @@ -120,44 +120,22 @@ def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): def test_metadata_changed_on_non_ignored_file_deletion(repo_dir): - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) + mtime, size = get_mtime_and_size(os.path.abspath(repo_dir.DATA_DIR)) os.remove(repo_dir.DATA_SUB) - new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) + new_mtime_sig, new_size = get_mtime_and_size( + os.path.abspath(repo_dir.DATA_DIR) + ) assert new_mtime_sig != mtime assert new_size != size -def test_should_ignore_for_external_dependency(dvc_repo, repo_dir): - external_data_dir = repo_dir.mkdtemp() - data = os.path.join(external_data_dir, "data_external") - ignored_file = "data_ignored" - ignored_full_path = os.path.join(external_data_dir, ignored_file) - ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) - repo_dir.create(data, "external_data_content") - repo_dir.create(ignored_full_path, "ignored_file_content") - repo_dir.create(ignore_file, "/" + ignored_full_path) - dvc_repo = Repo(dvc_repo.root_dir) - - stages = dvc_repo.add(external_data_dir) - assert len(stages) == 1 - - outs = stages[0].outs - assert len(outs) == 1 - - out_dir_cache = outs[0].dir_cache - assert len(out_dir_cache) == 1 - - assert out_dir_cache[0]["relpath"] == "data_external" - - def test_should_not_consider_dvcignore_after_adding(dvc_repo, repo_dir): ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) ignore_file_content = os.path.basename(repo_dir.DATA) repo_dir.create(ignore_file, ignore_file_content) - dvc_repo = Repo(dvc_repo.root_dir) with pytest.raises(DvcIgnoreInCollectedDirError): dvc_repo.add(repo_dir.DATA_DIR) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index dd475b4239..ab8eba4400 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -1,12 +1,19 @@ import os +import dvc import pytest +from mock import mock_open, patch from dvc.ignore import DvcIgnorePatterns, DvcIgnoreDirs def mock_dvcignore(dvcignore_path, patterns): - ignore_file = DvcIgnorePatterns(dvcignore_path, patterns) + + with patch.object( + dvc.ignore, "open", mock_open(read_data="\n".join(patterns)) + ): + ignore_file = DvcIgnorePatterns(dvcignore_path) + return ignore_file From 9427b4e9ef690d7b70cf3e750dd085b469e91187 Mon Sep 17 00:00:00 2001 From: pawel Date: Mon, 8 Jul 2019 08:00:16 +0200 Subject: [PATCH 12/14] ignore: loading upper level ignores in stages() --- dvc/ignore.py | 41 +++++++--------------------- dvc/remote/base.py | 1 + dvc/repo/__init__.py | 4 +-- dvc/scm/tree.py | 1 - dvc/utils/__init__.py | 2 -- dvc/utils/fs.py | 1 + tests/func/test_ignore.py | 53 ++++++++++++++++++++++--------------- tests/func/test_repo.py | 3 --- tests/unit/utils/test_fs.py | 1 + 9 files changed, 47 insertions(+), 60 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 53c8074541..1cb380161c 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -20,39 +20,13 @@ def __call__(self, root, dirs, files): raise NotImplementedError -# def full_path_pattern(pattern, path): -# # NOTE: '/' is translated to ^ for .gitignore style patterns, -# # it is natural to proceed absolute path with this sign -# if pattern.startswith("//"): -# return pattern -# -# negation = False -# -# if pattern.startswith("!"): -# pattern = pattern[1:] -# negation = True -# -# if pattern.startswith("/"): -# pattern = os.path.normpath(path) + pattern -# else: -# pattern = os.path.join(path, pattern) -# -# pattern = os.path.join(path, pattern) -# pattern = "/" + pattern -# -# if negation: -# pattern = "!" + pattern -# -# return pattern - - class DvcIgnorePatterns(DvcIgnore): def __init__(self, ignore_file_path): self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) with open(ignore_file_path, encoding="utf-8") as fobj: - self.rel_spec = PathSpec.from_lines(GitWildMatchPattern, fobj) + self.ignore_spec = PathSpec.from_lines(GitWildMatchPattern, fobj) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -66,11 +40,14 @@ def matches(self, dirname, basename): if os.pardir + os.sep in rel_path: return False - return self.rel_spec.match_file(rel_path) + return self.ignore_spec.match_file(rel_path) def __hash__(self): return hash(self.ignore_file_path) + def __eq__(self, other): + return self.ignore_file_path == other.ignore_file_path + class DvcIgnoreDirs(DvcIgnore): def __init__(self, basenames): @@ -84,21 +61,23 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): def __init__(self, root=None): - self.ignores = [DvcIgnoreDirs([".git", ".hg", ".dvc"])] + self.ignores = {DvcIgnoreDirs([".git", ".hg", ".dvc"])} self.root = root def load_upper_levels(self, top): top = os.path.abspath(top) if self.root: parent_dirs = get_parent_dirs_up_to(top, self.root) - parent_dirs.append(top) for d in parent_dirs: self.update(d) def update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if os.path.exists(ignore_file_path): - self.ignores.append(DvcIgnorePatterns(ignore_file_path)) + local_ignore = DvcIgnorePatterns(ignore_file_path) + if local_ignore in self.ignores: + self.ignores.remove(local_ignore) + self.ignores.add(local_ignore) def __call__(self, root, dirs, files): for ignore in self.ignores: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 755274eeeb..1be9f3cc64 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -274,6 +274,7 @@ def get_checksum(self, path_info): and not self.exists(self.cache.checksum_to_path_info(checksum)) ): checksum = None + if checksum: return checksum diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index d26c18b281..7fc97c286d 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -360,8 +360,6 @@ def pipelines(self, from_directory=None): def make_out_dir_filter(self, outs, root_dir): def filter_dirs(dname): path = os.path.join(root_dir, dname) - if path in (self.dvc_dir, self.scm.dir): - return False for out in outs: if path == os.path.normpath(out) or path.startswith(out): return False @@ -389,6 +387,8 @@ def stages(self, from_directory=None, check_dag=True): stages = [] outs = [] + self.dvcignore.load_upper_levels(from_directory) + for root, dirs, files in self.tree.walk( from_directory, dvcignore=self.dvcignore ): diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index 090e8746a3..5866906969 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -72,7 +72,6 @@ def walk(self, top, topdown=True, dvcignore=None): from dvc.ignore import DvcIgnoreFilter dvcignore = DvcIgnoreFilter() - dvcignore.load_upper_levels(top) def onerror(e): raise e diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 824689049f..b30b3ff18c 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -279,8 +279,6 @@ def dvc_walk( Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. """ - if dvcignore: - dvcignore.load_upper_levels(top) for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks ): diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index 001c52328e..a03b22487a 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -81,6 +81,7 @@ def get_parent_dirs_up_to(wdir, root_dir): return [] dirs = [] + dirs.append(wdir) while wdir != root_dir: wdir = os.path.dirname(wdir) dirs.append(wdir) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 1da50fb279..9db3b81571 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -5,6 +5,7 @@ from dvc.exceptions import DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore, DvcIgnoreFilter +from dvc.utils import walk_files from dvc.utils.compat import cast_bytes from dvc.utils.fs import get_mtime_and_size from tests.basic_env import TestDvc @@ -73,16 +74,13 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): [to_posixpath(repo_dir.DATA_SUB), to_posixpath(new_data_path)] ), ) + dvcignore.load_upper_levels(repo_dir.DATA_DIR) - mtime_sig, size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvcignore - ) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime_sig, new_size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvcignore - ) + new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) assert new_mtime_sig == mtime_sig assert new_size == size @@ -90,12 +88,10 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): def test_mtime_changed_when_moving_non_ignored_file(repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - mtime, size = get_mtime_and_size(os.path.abspath(repo_dir.DATA_DIR)) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime, new_size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR) - ) + new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR) assert new_mtime != mtime assert new_size == size @@ -105,33 +101,28 @@ def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) dvcignore = DvcIgnoreFilter(root=dvc_repo.root_dir) + dvcignore.load_upper_levels(repo_dir.DATA_DIR) - mtime, size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvcignore - ) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) os.remove(repo_dir.DATA_SUB) - new_mtime, new_size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR), dvcignore - ) + new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) assert new_mtime == mtime assert new_size == size def test_metadata_changed_on_non_ignored_file_deletion(repo_dir): - mtime, size = get_mtime_and_size(os.path.abspath(repo_dir.DATA_DIR)) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) os.remove(repo_dir.DATA_SUB) - new_mtime_sig, new_size = get_mtime_and_size( - os.path.abspath(repo_dir.DATA_DIR) - ) + new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) assert new_mtime_sig != mtime assert new_size != size -def test_should_not_consider_dvcignore_after_adding(dvc_repo, repo_dir): +def test_should_raise_on_dvcignore_in_out_dir(dvc_repo, repo_dir): ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) ignore_file_content = os.path.basename(repo_dir.DATA) @@ -139,3 +130,23 @@ def test_should_not_consider_dvcignore_after_adding(dvc_repo, repo_dir): with pytest.raises(DvcIgnoreInCollectedDirError): dvc_repo.add(repo_dir.DATA_DIR) + + +def test_should_update_ignore_with_new_ignores(dvc_repo, repo_dir): + ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) + dvcignore = DvcIgnoreFilter() + + files = set(walk_files(repo_dir.DATA_DIR, dvcignore)) + assert files == {repo_dir.DATA, repo_dir.DATA_SUB} + + repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB) + os.linesep) + dvcignore.update(os.path.dirname(ignore_file)) + + files = set(walk_files(repo_dir.DATA_DIR, dvcignore)) + assert files == {repo_dir.DATA} + + repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA)) + dvcignore.update(os.path.dirname(ignore_file)) + + files = set(walk_files(repo_dir.DATA_DIR, dvcignore)) + assert files == set() diff --git a/tests/func/test_repo.py b/tests/func/test_repo.py index fdf36bd2a3..abd8ed3805 100644 --- a/tests/func/test_repo.py +++ b/tests/func/test_repo.py @@ -1,5 +1,4 @@ from dvc.main import main -from dvc.repo import Repo from dvc.stage import Stage from tests.basic_env import TestDvcGit @@ -61,8 +60,6 @@ def test_should_not_gather_stage_files_from_ignored_d(self): with open(".dvcignore", "w") as fobj: fobj.write("data_dir") - self.dvc = Repo(self.dvc.root_dir) - stages = self.dvc.stages() self.assertEqual(2, len(stages)) diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index 18a30e4f9d..abcb5bc25f 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -125,6 +125,7 @@ def test_get_parent_dirs_up_to_should_raise_on_no_absolute(path1, path2): [ os.path.join(os.sep, "some"), os.path.join(os.sep, "some", "long"), + os.path.join(os.sep, "some", "long", "path"), ], ), ], From 9b6cfd96a917d81b18f651f5542bf6d424a34a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Redzy=C5=84ski?= Date: Tue, 9 Jul 2019 10:31:06 +0200 Subject: [PATCH 13/14] Update dvc/repo/__init__.py Co-Authored-By: Ruslan Kuprieiev --- dvc/repo/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 7fc97c286d..8aae3419aa 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -357,7 +357,7 @@ def pipelines(self, from_directory=None): G.subgraph(c).copy() for c in nx.weakly_connected_components(G) ] - def make_out_dir_filter(self, outs, root_dir): + def _make_out_dir_filter(self, outs, root_dir): def filter_dirs(dname): path = os.path.join(root_dir, dname) for out in outs: From 9a362938881959c0770dbc3c876ead69f06f2235 Mon Sep 17 00:00:00 2001 From: pared Date: Tue, 9 Jul 2019 11:54:35 +0200 Subject: [PATCH 14/14] dvcignore review refactor --- dvc/exceptions.py | 6 ++-- dvc/ignore.py | 24 ++++++--------- dvc/remote/base.py | 1 + dvc/remote/local/__init__.py | 2 +- dvc/repo/__init__.py | 19 +++++++----- dvc/scm/tree.py | 10 ++---- dvc/utils/__init__.py | 20 +++++++----- dvc/utils/fs.py | 2 +- tests/func/test_checkout.py | 2 +- tests/func/test_ignore.py | 59 ++++++++++++------------------------ tests/func/test_repo.py | 8 +++-- tests/func/test_tree.py | 12 +++++--- tests/unit/test_ignore.py | 4 +-- tests/unit/utils/test_fs.py | 7 +++-- 14 files changed, 82 insertions(+), 94 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index b47bbee7bc..da96fd0f9f 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -273,8 +273,8 @@ def __init__(self, path): class DvcIgnoreInCollectedDirError(DvcException): - def __init(self, ignore_dirname): + def __init__(self, ignore_dirname): super(DvcIgnoreInCollectedDirError, self).__init__( - ".dvcignore file should not be in collected dir path: '{" - "}'".format(ignore_dirname) + ".dvcignore file should not be in collected dir path: " + "'{}'".format(ignore_dirname) ) diff --git a/dvc/ignore.py b/dvc/ignore.py index 1cb380161c..b24c7824fe 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -6,7 +6,6 @@ from pathspec.patterns import GitWildMatchPattern from dvc.utils import relpath -from dvc.utils.fs import get_parent_dirs_up_to from dvc.utils.compat import open @@ -22,6 +21,8 @@ def __call__(self, root, dirs, files): class DvcIgnorePatterns(DvcIgnore): def __init__(self, ignore_file_path): + assert os.path.isabs(ignore_file_path) + self.ignore_file_path = ignore_file_path self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) @@ -60,24 +61,17 @@ def __call__(self, root, dirs, files): class DvcIgnoreFilter(object): - def __init__(self, root=None): + def __init__(self, root_dir): self.ignores = {DvcIgnoreDirs([".git", ".hg", ".dvc"])} - self.root = root - - def load_upper_levels(self, top): - top = os.path.abspath(top) - if self.root: - parent_dirs = get_parent_dirs_up_to(top, self.root) - for d in parent_dirs: - self.update(d) + self._update(root_dir) + for root, dirs, _ in os.walk(root_dir): + for d in dirs: + self._update(os.path.join(root, d)) - def update(self, dirname): + def _update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if os.path.exists(ignore_file_path): - local_ignore = DvcIgnorePatterns(ignore_file_path) - if local_ignore in self.ignores: - self.ignores.remove(local_ignore) - self.ignores.add(local_ignore) + self.ignores.add(DvcIgnorePatterns(ignore_file_path)) def __call__(self, root, dirs, files): for ignore in self.ignores: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 1be9f3cc64..c4a88467d8 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -152,6 +152,7 @@ def _collect_dir(self, path_info): with ThreadPoolExecutor(max_workers=self.checksum_jobs) as executor: for root, _dirs, files in self.walk(path_info): root_info = path_info / root + for fname in files: if fname == DvcIgnore.DVCIGNORE_FILE: diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index 79afe2ef09..fa72369d5b 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -223,7 +223,7 @@ def isdir(self, path_info): return os.path.isdir(fspath_py35(path_info)) def walk(self, path_info): - return dvc_walk(fspath_py35(path_info), dvcignore=self.repo.dvcignore) + return dvc_walk(path_info, self.repo.dvcignore) def get_file_checksum(self, path_info): return file_md5(fspath_py35(path_info))[0] diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 8aae3419aa..4ec38c0535 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -5,6 +5,8 @@ from itertools import chain +from funcy import cached_property + from dvc.config import Config from dvc.exceptions import ( NotDvcRepoError, @@ -66,7 +68,6 @@ def __init__(self, root_dir=None): self.scm = SCM(self.root_dir, repo=self) self.tree = WorkingTree(self.root_dir) - self.dvcignore = DvcIgnoreFilter(self.root_dir) self.lock = Lock(self.dvc_dir) # NOTE: storing state and link_state in the repository itself to avoid @@ -357,15 +358,16 @@ def pipelines(self, from_directory=None): G.subgraph(c).copy() for c in nx.weakly_connected_components(G) ] - def _make_out_dir_filter(self, outs, root_dir): + @staticmethod + def _filter_out_dirs(dirs, outs, root_dir): def filter_dirs(dname): path = os.path.join(root_dir, dname) for out in outs: - if path == os.path.normpath(out) or path.startswith(out): + if path == os.path.normpath(out): return False return True - return filter_dirs + return list(filter(filter_dirs, dirs)) def stages(self, from_directory=None, check_dag=True): """ @@ -387,8 +389,6 @@ def stages(self, from_directory=None, check_dag=True): stages = [] outs = [] - self.dvcignore.load_upper_levels(from_directory) - for root, dirs, files in self.tree.walk( from_directory, dvcignore=self.dvcignore ): @@ -402,8 +402,7 @@ def stages(self, from_directory=None, check_dag=True): outs.append(out.fspath + out.sep) stages.append(stage) - out_dir_filter = self.make_out_dir_filter(outs, root) - dirs[:] = list(filter(out_dir_filter, dirs)) + dirs[:] = self._filter_out_dirs(dirs, outs, root) if check_dag: self.check_dag(stages) @@ -471,3 +470,7 @@ def open(self, path, remote=None, mode="r", encoding=None): raise OutputFileMissingError(relpath(path, self.root_dir)) return _open(cache_file, mode=mode, encoding=encoding) + + @cached_property + def dvcignore(self): + return DvcIgnoreFilter(self.root_dir) diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index 5866906969..a90306f769 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -68,18 +68,12 @@ def walk(self, top, topdown=True, dvcignore=None): - it could raise exceptions, there is no onerror argument """ - if not dvcignore: - from dvc.ignore import DvcIgnoreFilter - - dvcignore = DvcIgnoreFilter() + assert dvcignore def onerror(e): raise e for root, dirs, files in dvc_walk( - os.path.abspath(top), - topdown=topdown, - onerror=onerror, - dvcignore=dvcignore, + os.path.abspath(top), dvcignore, topdown=topdown, onerror=onerror ): yield os.path.normpath(root), dirs, files diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index b30b3ff18c..ed04b8ace7 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -2,7 +2,14 @@ from __future__ import unicode_literals -from dvc.utils.compat import str, builtin_str, open, cast_bytes_py2, StringIO +from dvc.utils.compat import ( + str, + builtin_str, + open, + cast_bytes_py2, + StringIO, + fspath_py35, +) from dvc.utils.compat import fspath import os @@ -272,26 +279,25 @@ def to_yaml_string(data): return stream.getvalue() -def dvc_walk( - top, topdown=True, onerror=None, followlinks=False, dvcignore=None -): +def dvc_walk(top, dvcignore, topdown=True, onerror=None, followlinks=False): """ Proxy for `os.walk` directory tree generator. Utilizes DvcIgnoreFilter functionality. """ + top = fspath_py35(top) + for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks ): if dvcignore: - dvcignore.update(root) dirs[:], files[:] = dvcignore(root, dirs, files) yield root, dirs, files -def walk_files(directory, dvcignore=None): - for root, _, files in dvc_walk(directory, dvcignore=dvcignore): +def walk_files(directory, dvcignore): + for root, _, files in dvc_walk(directory, dvcignore): for f in files: yield os.path.join(root, f) diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index a03b22487a..811b4195a9 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -20,7 +20,7 @@ def get_inode(path): return inode -def get_mtime_and_size(path, dvcignore=None): +def get_mtime_and_size(path, dvcignore): if os.path.isdir(path): size = 0 files_mtimes = {} diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index c984ff4100..1aa7a83702 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -136,7 +136,7 @@ def outs_info(self, stage): paths = [ path for output in stage["outs"] - for path in walk_files(output["path"]) + for path in walk_files(output["path"], self.dvc.dvcignore) ] return [ diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 9db3b81571..2b5fec7e91 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -4,8 +4,7 @@ import pytest from dvc.exceptions import DvcIgnoreInCollectedDirError -from dvc.ignore import DvcIgnore, DvcIgnoreFilter -from dvc.utils import walk_files +from dvc.ignore import DvcIgnore from dvc.utils.compat import cast_bytes from dvc.utils.fs import get_mtime_and_size from tests.basic_env import TestDvc @@ -65,7 +64,6 @@ def test_ignore_in_parent_dir(self): def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - dvcignore = DvcIgnoreFilter(root=dvc_repo.root_dir) ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create( @@ -74,24 +72,27 @@ def test_metadata_unchanged_when_moving_ignored_file(dvc_repo, repo_dir): [to_posixpath(repo_dir.DATA_SUB), to_posixpath(new_data_path)] ), ) - dvcignore.load_upper_levels(repo_dir.DATA_DIR) - mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) + mtime_sig, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) + new_mtime_sig, new_size = get_mtime_and_size( + repo_dir.DATA_DIR, dvc_repo.dvcignore + ) assert new_mtime_sig == mtime_sig assert new_size == size -def test_mtime_changed_when_moving_non_ignored_file(repo_dir): +def test_mtime_changed_when_moving_non_ignored_file(dvc_repo, repo_dir): new_data_path = repo_dir.DATA_SUB + "_new" - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) shutil.move(repo_dir.DATA_SUB, new_data_path) - new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR) + new_mtime, new_size = get_mtime_and_size( + repo_dir.DATA_DIR, dvc_repo.dvcignore + ) assert new_mtime != mtime assert new_size == size @@ -100,23 +101,25 @@ def test_mtime_changed_when_moving_non_ignored_file(repo_dir): def test_metadata_unchanged_on_ignored_file_deletion(dvc_repo, repo_dir): ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB)) - dvcignore = DvcIgnoreFilter(root=dvc_repo.root_dir) - dvcignore.load_upper_levels(repo_dir.DATA_DIR) - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) os.remove(repo_dir.DATA_SUB) - new_mtime, new_size = get_mtime_and_size(repo_dir.DATA_DIR, dvcignore) + new_mtime, new_size = get_mtime_and_size( + repo_dir.DATA_DIR, dvc_repo.dvcignore + ) assert new_mtime == mtime assert new_size == size -def test_metadata_changed_on_non_ignored_file_deletion(repo_dir): - mtime, size = get_mtime_and_size(repo_dir.DATA_DIR) +def test_metadata_changed_on_non_ignored_file_deletion(dvc_repo, repo_dir): + mtime, size = get_mtime_and_size(repo_dir.DATA_DIR, dvc_repo.dvcignore) os.remove(repo_dir.DATA_SUB) - new_mtime_sig, new_size = get_mtime_and_size(repo_dir.DATA_DIR) + new_mtime_sig, new_size = get_mtime_and_size( + repo_dir.DATA_DIR, dvc_repo.dvcignore + ) assert new_mtime_sig != mtime assert new_size != size @@ -124,29 +127,7 @@ def test_metadata_changed_on_non_ignored_file_deletion(repo_dir): def test_should_raise_on_dvcignore_in_out_dir(dvc_repo, repo_dir): ignore_file = os.path.join(repo_dir.DATA_DIR, DvcIgnore.DVCIGNORE_FILE) - ignore_file_content = os.path.basename(repo_dir.DATA) - - repo_dir.create(ignore_file, ignore_file_content) + repo_dir.create(ignore_file, "") with pytest.raises(DvcIgnoreInCollectedDirError): dvc_repo.add(repo_dir.DATA_DIR) - - -def test_should_update_ignore_with_new_ignores(dvc_repo, repo_dir): - ignore_file = os.path.join(dvc_repo.root_dir, DvcIgnore.DVCIGNORE_FILE) - dvcignore = DvcIgnoreFilter() - - files = set(walk_files(repo_dir.DATA_DIR, dvcignore)) - assert files == {repo_dir.DATA, repo_dir.DATA_SUB} - - repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA_SUB) + os.linesep) - dvcignore.update(os.path.dirname(ignore_file)) - - files = set(walk_files(repo_dir.DATA_DIR, dvcignore)) - assert files == {repo_dir.DATA} - - repo_dir.create(ignore_file, to_posixpath(repo_dir.DATA)) - dvcignore.update(os.path.dirname(ignore_file)) - - files = set(walk_files(repo_dir.DATA_DIR, dvcignore)) - assert files == set() diff --git a/tests/func/test_repo.py b/tests/func/test_repo.py index abd8ed3805..8da5082ddb 100644 --- a/tests/func/test_repo.py +++ b/tests/func/test_repo.py @@ -1,4 +1,6 @@ +from dvc.ignore import DvcIgnore from dvc.main import main +from dvc.repo import Repo from dvc.stage import Stage from tests.basic_env import TestDvcGit @@ -50,16 +52,16 @@ class TestIgnore(TestDvcGit): def _stage_name(self, file): return file + Stage.STAGE_FILE_SUFFIX - def test_should_not_gather_stage_files_from_ignored_d(self): + def test_should_not_gather_stage_files_from_ignored_dir(self): ret = main(["add", self.FOO, self.BAR, self.DATA, self.DATA_SUB]) self.assertEqual(0, ret) stages = self.dvc.stages() self.assertEqual(4, len(stages)) - with open(".dvcignore", "w") as fobj: - fobj.write("data_dir") + self.create(DvcIgnore.DVCIGNORE_FILE, self.DATA_DIR) + self.dvc = Repo(self.dvc.root_dir) stages = self.dvc.stages() self.assertEqual(2, len(stages)) diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 8e937d4161..7bec4ce71d 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -4,6 +4,7 @@ from os.path import join +from dvc.ignore import DvcIgnoreFilter from dvc.scm import SCM from dvc.scm.git import GitTree from dvc.scm.tree import WorkingTree @@ -97,9 +98,10 @@ def convert_to_sets(walk_results): class TestWalkInNoSCM(AssertWalkEqualMixin, TestDir): def test(self): + dvcignore = DvcIgnoreFilter(self.root_dir) tree = WorkingTree(self._root_dir) self.assertWalkEqual( - tree.walk(self._root_dir), + tree.walk(self._root_dir, dvcignore=dvcignore), [ ( self._root_dir, @@ -116,9 +118,10 @@ def test(self): ) def test_subdir(self): + dvcignore = DvcIgnoreFilter(self.root_dir) tree = WorkingTree(self._root_dir) self.assertWalkEqual( - tree.walk(join("data_dir", "data_sub_dir")), + tree.walk(join("data_dir", "data_sub_dir"), dvcignore=dvcignore), [ ( join(self._root_dir, "data_dir", "data_sub_dir"), @@ -132,8 +135,9 @@ def test_subdir(self): class TestWalkInGit(AssertWalkEqualMixin, TestGit): def test_nobranch(self): tree = WorkingTree(self._root_dir) + dvcignore = DvcIgnoreFilter(self._root_dir) self.assertWalkEqual( - tree.walk("."), + tree.walk(".", dvcignore=dvcignore), [ ( self._root_dir, @@ -149,7 +153,7 @@ def test_nobranch(self): ], ) self.assertWalkEqual( - tree.walk(join("data_dir", "data_sub_dir")), + tree.walk(join("data_dir", "data_sub_dir"), dvcignore=dvcignore), [ ( join(self._root_dir, "data_dir", "data_sub_dir"), diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index ab8eba4400..d27c9f1ec8 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -12,9 +12,9 @@ def mock_dvcignore(dvcignore_path, patterns): with patch.object( dvc.ignore, "open", mock_open(read_data="\n".join(patterns)) ): - ignore_file = DvcIgnorePatterns(dvcignore_path) + ignore_patterns = DvcIgnorePatterns(dvcignore_path) - return ignore_file + return ignore_patterns def test_ignore_from_file_should_filter_dirs_and_files(): diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index abcb5bc25f..30a96b63fc 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -3,6 +3,8 @@ import dvc import pytest + +from dvc.ignore import DvcIgnoreFilter from dvc.system import System from dvc.path_info import PathInfo from dvc.utils import relpath @@ -20,8 +22,9 @@ class TestMtimeAndSize(TestDir): def test(self): - file_time, file_size = get_mtime_and_size(self.DATA) - dir_time, dir_size = get_mtime_and_size(self.DATA_DIR) + dvcignore = DvcIgnoreFilter(self.root_dir) + file_time, file_size = get_mtime_and_size(self.DATA, dvcignore) + dir_time, dir_size = get_mtime_and_size(self.DATA_DIR, dvcignore) actual_file_size = os.path.getsize(self.DATA) actual_dir_size = os.path.getsize(self.DATA) + os.path.getsize(