diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 4842018a4a..6fd64e67da 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -5,6 +5,7 @@ import colorama +from dvc.command import completion from dvc.command.base import CmdBase, append_doc_link from dvc.exceptions import DvcException @@ -127,7 +128,9 @@ def _format(diff, hide_missing=False): def run(self): try: - diff = self.repo.diff(self.args.a_rev, self.args.b_rev) + diff = self.repo.diff( + self.args.a_rev, self.args.b_rev, self.args.targets + ) show_hash = self.args.show_hash hide_missing = self.args.b_rev or self.args.hide_missing if hide_missing: @@ -167,6 +170,14 @@ def add_parser(subparsers, parent_parser): help=DIFF_DESCRIPTION, formatter_class=argparse.RawDescriptionHelpFormatter, ) + diff_parser.add_argument( + "--targets", + nargs="*", + help=( + "Limit command scope to these tracked files or directories. " + "Accepts one or more file paths." + ), + ).complete = completion.FILE diff_parser.add_argument( "a_rev", help="Old Git commit to compare (defaults to HEAD)", diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index ba7a4c44ca..097d30d4ae 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,6 +1,7 @@ import logging import os +from dvc.exceptions import PathMissingError from dvc.repo import locked from dvc.tree.local import LocalTree from dvc.tree.repo import RepoTree @@ -9,7 +10,7 @@ @locked -def diff(self, a_rev="HEAD", b_rev=None): +def diff(self, a_rev="HEAD", b_rev=None, targets=None): """ By default, it compares the workspace with the last commit's tree. @@ -23,12 +24,28 @@ def diff(self, a_rev="HEAD", b_rev=None): b_rev = b_rev if b_rev else "workspace" results = {} + missing_targets = {} for rev in self.brancher(revs=[a_rev, b_rev]): if rev == "workspace" and rev != b_rev: # brancher always returns workspace, but we only need to compute # workspace paths/checksums if b_rev was None continue - results[rev] = _paths_checksums(self) + + targets_path_infos = None + if targets is not None: + # convert targets to path_infos, and capture any missing targets + targets_path_infos, missing_targets[rev] = _targets_to_path_infos( + self, targets + ) + + results[rev] = _paths_checksums(self, targets_path_infos) + + if targets is not None: + # check for overlapping missing targets between a_rev and b_rev + for target in set(missing_targets[a_rev]) & set( + missing_targets[b_rev] + ): + raise PathMissingError(target, self) old = results[a_rev] new = results[b_rev] @@ -62,7 +79,7 @@ def diff(self, a_rev="HEAD", b_rev=None): return ret if any(ret.values()) else {} -def _paths_checksums(repo): +def _paths_checksums(repo, targets): """ A dictionary of checksums addressed by relpaths collected from the current tree outputs. @@ -74,10 +91,10 @@ def _paths_checksums(repo): file: "data" """ - return dict(_output_paths(repo)) + return dict(_output_paths(repo, targets)) -def _output_paths(repo): +def _output_paths(repo, targets): repo_tree = RepoTree(repo, stream=True) on_working_tree = isinstance(repo.tree, LocalTree) @@ -101,17 +118,29 @@ def _to_checksum(output): for stage in repo.stages: for output in stage.outs: if _exists(output): - yield _to_path(output), _to_checksum(output) - if output.is_dir_checksum: - yield from _dir_output_paths(repo_tree, output) + yield_output = targets is None or any( + output.path_info.isin_or_eq(target) for target in targets + ) + if yield_output: + yield _to_path(output), _to_checksum(output) -def _dir_output_paths(repo_tree, output): + if output.is_dir_checksum and ( + yield_output + or any(target.isin(output.path_info) for target in targets) + ): + yield from _dir_output_paths(repo_tree, output, targets) + + +def _dir_output_paths(repo_tree, output, targets=None): from dvc.config import NoRemoteError try: for fname in repo_tree.walk_files(output.path_info): - yield str(fname), repo_tree.get_file_hash(fname).value + if targets is None or any( + fname.isin_or_eq(target) for target in targets + ): + yield str(fname), repo_tree.get_file_hash(fname).value except NoRemoteError: logger.warning("dir cache entry for '%s' is missing", output) @@ -124,3 +153,18 @@ def _filter_missing(repo, paths): out = metadata.outs[0] if out.status().get(str(out)) == "not in cache": yield path + + +def _targets_to_path_infos(repo, targets): + path_infos = [] + missing = [] + + repo_tree = RepoTree(repo, stream=True) + + for target in targets: + if repo_tree.exists(target): + path_infos.append(repo_tree.metadata(target).path_info) + else: + missing.append(target) + + return path_infos, missing diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 6f296aa35e..63cd02c38f 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -264,3 +264,141 @@ def test_no_commits(tmp_dir): assert Git().no_commits assert Repo.init().diff() == {} + + +def setup_targets_test(tmp_dir): + tmp_dir.dvc_gen("file", "first", commit="add a file") + + tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}) + tmp_dir.dvc_gen("file", "second") + + tmp_dir.dvc_gen(os.path.join("dir_with", "file.txt"), "first") + + +def test_targets_missing_path(tmp_dir, scm, dvc): + from dvc.exceptions import PathMissingError + + setup_targets_test(tmp_dir) + + with pytest.raises(PathMissingError): + dvc.diff(targets=["missing"]) + + +def test_targets_single_file(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + assert dvc.diff(targets=["file"]) == { + "added": [], + "deleted": [], + "modified": [ + { + "path": "file", + "hash": {"old": digest("first"), "new": digest("second")}, + } + ], + "not in cache": [], + } + + +def test_targets_single_dir(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir" + + expected_result = { + "added": [ + {"path": os.path.join("dir", ""), "hash": dir_checksum}, + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, + ], + "deleted": [], + "modified": [], + "not in cache": [], + } + + assert dvc.diff(targets=["dir"]) == expected_result + assert dvc.diff(targets=["dir" + os.path.sep]) == expected_result + + +def test_targets_single_file_in_dir(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + assert dvc.diff(targets=[os.path.join("dir", "1")]) == { + "added": [{"path": os.path.join("dir", "1"), "hash": digest("1")}], + "deleted": [], + "modified": [], + "not in cache": [], + } + + +def test_targets_two_files_in_dir(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + assert dvc.diff( + targets=[os.path.join("dir", "1"), os.path.join("dir", "2")] + ) == { + "added": [ + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, + ], + "deleted": [], + "modified": [], + "not in cache": [], + } + + +def test_targets_file_and_dir(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir" + + assert dvc.diff(targets=["file", "dir"]) == { + "added": [ + {"path": os.path.join("dir", ""), "hash": dir_checksum}, + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, + ], + "deleted": [], + "modified": [ + { + "path": "file", + "hash": {"old": digest("first"), "new": digest("second")}, + } + ], + "not in cache": [], + } + + +def test_targets_single_dir_with_file(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + expected_result = { + "added": [ + { + "path": os.path.join("dir_with", "file.txt"), + "hash": digest("first"), + }, + ], + "deleted": [], + "modified": [], + "not in cache": [], + } + + assert dvc.diff(targets=["dir_with"]) == expected_result + assert dvc.diff(targets=["dir_with" + os.path.sep]) == expected_result + + +def test_targets_single_file_in_dir_with_file(tmp_dir, scm, dvc): + setup_targets_test(tmp_dir) + + assert dvc.diff(targets=[os.path.join("dir_with", "file.txt")]) == { + "added": [ + { + "path": os.path.join("dir_with", "file.txt"), + "hash": digest("first"), + }, + ], + "deleted": [], + "modified": [], + "not in cache": [], + }