Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions dvc/command/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ def __init__(self, args):
@property
def default_targets(self):
"""Default targets for `dvc repro` and `dvc pipeline`."""
from dvc.stage import Stage
from dvc.dvcfile import DVC_FILE

msg = "assuming default target '{}'.".format(Stage.STAGE_FILE)
msg = "assuming default target '{}'.".format(DVC_FILE)
logger.warning(msg)
return [Stage.STAGE_FILE]
return [DVC_FILE]

# Abstract methods that have to be implemented by any inheritance class
def run(self):
Expand Down
8 changes: 4 additions & 4 deletions dvc/command/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
class CmdPipelineShow(CmdBase):
def _show(self, target, commands, outs, locked):
import networkx
from dvc.stage import Stage
from dvc.dvcfile import Dvcfile

stage = Stage.load(self.repo, target)
stage = Dvcfile(self.repo, target).load()
G = self.repo.graph
stages = networkx.dfs_postorder_nodes(G, stage)

Expand All @@ -33,10 +33,10 @@ def _show(self, target, commands, outs, locked):

def _build_graph(self, target, commands, outs):
import networkx
from dvc.stage import Stage
from dvc.dvcfile import Dvcfile
from dvc.repo.graph import get_pipeline

target_stage = Stage.load(self.repo, target)
target_stage = Dvcfile(self.repo, target).load()
G = get_pipeline(self.repo.pipelines, target_stage)

nodes = set()
Expand Down
7 changes: 0 additions & 7 deletions dvc/command/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def run(self):
no_exec=self.args.no_exec,
overwrite=overwrite,
ignore_build_cache=self.args.ignore_build_cache,
remove_outs=self.args.remove_outs,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed this deprecated flag.

no_commit=self.args.no_commit,
outs_persist=self.args.outs_persist,
outs_persist_no_cache=self.args.outs_persist_no_cache,
Expand Down Expand Up @@ -168,12 +167,6 @@ def add_parser(subparsers, parent_parser):
help="Run this stage even if it has been already ran with the same "
"command/dependencies/outputs/etc before.",
)
run_parser.add_argument(
"--remove-outs",
action="store_true",
default=False,
help="Deprecated, this is now the default behavior",
)
run_parser.add_argument(
"--no-commit",
action="store_true",
Expand Down
174 changes: 174 additions & 0 deletions dvc/dvcfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import os
import re
import logging

import dvc.prompt as prompt

from voluptuous import MultipleInvalid

from dvc import dependency, output
from dvc.stage.exceptions import (
StageFileBadNameError,
StageFileDoesNotExistError,
StageFileIsNotDvcFileError,
StageFileFormatError,
StageFileAlreadyExistsError,
)
from dvc.utils import relpath
from dvc.utils.collections import apply_diff
from dvc.utils.stage import (
parse_stage_for_update,
dump_stage_file,
parse_stage,
)

logger = logging.getLogger(__name__)

DVC_FILE = "Dvcfile"
DVC_FILE_SUFFIX = ".dvc"
TAG_REGEX = r"^(?P<path>.*)@(?P<tag>[^\\/@:]*)$"


class Dvcfile:
def __init__(self, repo, path):
    """Bind this DVC-file to ``repo`` and split ``path`` into path and tag.

    ``path`` is passed through ``_get_path_tag``; the two values it returns
    are stored as ``self.path`` and ``self.tag``.
    """
    self.repo = repo
    parsed = self._get_path_tag(path)
    self.path, self.tag = parsed

def __repr__(self):
    """Return ``"<DVC_FILE>: <path>"`` describing this file."""
    label, location = DVC_FILE, self.path
    return "{}: {}".format(label, location)

@classmethod
def is_valid_filename(cls, path):
    """Tell whether ``path`` is named like a DVC-file.

    A name is valid when it ends with ``DVC_FILE_SUFFIX`` or when its
    basename equals ``DVC_FILE``.
    """
    if path.endswith(DVC_FILE_SUFFIX):
        return True
    return os.path.basename(path) == DVC_FILE

# True only when `path` is an existing file on disk (os.path.isfile) and
# its name passes `is_valid_filename`.
@classmethod
def is_stage_file(cls, path):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This kind of naming will be changing on upcoming PRs. For now, stage_file and dvcfile are essentially the same thing and should work.

return os.path.isfile(path) and cls.is_valid_filename(path)

# Raise StageFileBadNameError when `path` does not pass `is_valid_filename`.
# The message shows the relative path and suggests '<basename>.dvc' as an
# example of an acceptable name.
@classmethod
def check_dvc_filename(cls, path):
if not cls.is_valid_filename(path):
raise StageFileBadNameError(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, exceptions will probably change wherever this does not make sense in future PRs.

"bad DVC-file name '{}'. DVC-files should be named "
"'Dvcfile' or have a '.dvc' suffix (e.g. '{}.dvc').".format(
relpath(path), os.path.basename(path)
)
)

def exists(self):
    """Return what the repo's tree reports for the existence of ``self.path``."""
    tree = self.repo.tree
    return tree.exists(self.path)

def check_file_exists(self):
    """Raise ``StageFileDoesNotExistError`` unless ``self.exists()`` is true."""
    if self.exists():
        return
    raise StageFileDoesNotExistError(self.path)

def check_isfile(self):
    """Raise ``StageFileIsNotDvcFileError`` unless the tree sees a file.

    The check is delegated to ``self.repo.tree.isfile(self.path)``.
    """
    if self.repo.tree.isfile(self.path):
        return
    raise StageFileIsNotDvcFileError(self.path)

@staticmethod
def _get_path_tag(s):
    """Split ``s`` into ``(path, tag)`` according to ``TAG_REGEX``.

    When ``s`` does not match the pattern, it is returned unchanged
    together with ``None`` as the tag.
    """
    found = re.match(TAG_REGEX, s)
    if found is None:
        return s, None
    return found.group("path"), found.group("tag")

def dump(self, stage):
    """Write ``stage`` to this DVC-file.

    Currently this simply delegates to ``dump_single_stage``.
    """
    self.dump_single_stage(stage)

def dump_single_stage(self, stage):
    """Serialize ``stage`` into this DVC-file and register it with the SCM.

    The filename is validated first (``check_dvc_filename``). If the stage
    carries the text it was loaded from, that text is re-parsed and
    updated in place so the result keeps the user's comments and layout.
    """
    self.check_dvc_filename(self.path)

    logger.debug(
        "Saving information to '{file}'.".format(file=relpath(self.path))
    )
    new_state = stage.dumpd()

    # Loading uses a fast YAML parser that drops comments and formatting.
    # To keep them on update, the original text is parsed again with the
    # slower ruamel-based parser, the fresh state is diffed into that
    # structure, and the structure itself is what gets written out.
    original_text = stage._stage_text
    if original_text is not None:
        preserved = parse_stage_for_update(original_text, self.path)
        # Stage never touches "meta", so .dumpd() lacks it; carry it over
        # so the diff below does not drop it.
        if "meta" in preserved:
            new_state["meta"] = preserved["meta"]
        apply_diff(new_state, preserved)
        new_state = preserved

    dump_stage_file(self.path, new_state)

    self.repo.scm.track_file(relpath(self.path))

def load(self):
    """Load the single stage stored in this DVC-file.

    Returns:
        The ``Stage`` built from the parsed file, with ``deps`` and
        ``outs`` populated from the file's contents.

    Raises:
        StageFileDoesNotExistError: the path does not exist.
        StageFileBadNameError: the filename is not a valid DVC-file name.
        StageFileIsNotDvcFileError: the path is not a file.
        StageFileFormatError: the contents fail schema validation.
    """
    from dvc.stage import Stage

    # The checks run in priority order so the most relevant error wins:
    # 1. the file doesn't exist
    # 2. the filename is not a DVC-file name
    # 3. the path doesn't represent a regular file
    self.check_file_exists()
    self.check_dvc_filename(self.path)
    self.check_isfile()

    with self.repo.tree.open(self.path) as fd:
        raw_text = fd.read()
    data = parse_stage(raw_text, self.path)

    Dvcfile.validate(data, fname=relpath(self.path))

    abs_path = os.path.abspath(self.path)
    # The working directory is resolved relative to the file's own folder
    # and defaults to that folder when the file does not set one.
    base_dir = os.path.dirname(abs_path)
    wdir = os.path.abspath(
        os.path.join(base_dir, data.get(Stage.PARAM_WDIR, "."))
    )

    loaded = Stage(
        repo=self.repo,
        path=abs_path,
        wdir=wdir,
        cmd=data.get(Stage.PARAM_CMD),
        md5=data.get(Stage.PARAM_MD5),
        locked=data.get(Stage.PARAM_LOCKED, False),
        tag=self.tag,
        always_changed=data.get(Stage.PARAM_ALWAYS_CHANGED, False),
        # Keep the original text so later dumps can update it in place.
        stage_text=raw_text,
    )

    raw_deps = data.get(Stage.PARAM_DEPS) or []
    raw_outs = data.get(Stage.PARAM_OUTS) or []
    loaded.deps = dependency.loadd_from(loaded, raw_deps)
    loaded.outs = output.loadd_from(loaded, raw_outs)

    return loaded

@staticmethod
def validate(d, fname=None):
    """Check ``d`` against ``SINGLE_STAGE_SCHEMA``.

    Raises:
        StageFileFormatError: built from ``fname`` and the underlying
            ``MultipleInvalid`` error when the schema rejects ``d``.
    """
    from dvc.stage.schema import SINGLE_STAGE_SCHEMA as schema

    try:
        schema(d)
    except MultipleInvalid as error:
        raise StageFileFormatError(fname, error)

def overwrite_with_prompt(self, force=False):
    """Remove an existing file at ``self.path`` after getting consent.

    Does nothing when the file does not exist. Otherwise the file is
    deleted if ``force`` is true or the user confirms the prompt.

    Raises:
        StageFileAlreadyExistsError: the file exists, ``force`` is false
            and the user declined the prompt.
    """
    if self.exists():
        question = (
            "'{}' already exists. Do you wish to run the command and "
            "overwrite it?".format(self.path)
        )
        # `force` short-circuits, so the user is only asked when needed.
        confirmed = force or prompt.confirm(question)
        if not confirmed:
            raise StageFileAlreadyExistsError(self.path)

        os.unlink(self.path)
4 changes: 2 additions & 2 deletions dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def get_used_cache(self, **kwargs):

@classmethod
def _validate_output_path(cls, path):
from dvc.stage import Stage
from dvc.dvcfile import Dvcfile

if Stage.is_valid_filename(path):
if Dvcfile.is_valid_filename(path):
raise cls.IsStageFileError(path)
16 changes: 8 additions & 8 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def check_modified_graph(self, new_stages):

def collect(self, target, with_deps=False, recursive=False, graph=None):
import networkx as nx
from dvc.stage import Stage
from ..dvcfile import Dvcfile

if not target:
return list(graph) if graph else self.stages
Expand All @@ -204,7 +204,7 @@ def collect(self, target, with_deps=False, recursive=False, graph=None):
stages = nx.dfs_postorder_nodes(graph or self.graph)
return [stage for stage in stages if path_isin(stage.path, target)]

stage = Stage.load(self, target)
stage = Dvcfile(self, target).load()

# Optimization: do not collect the graph for a specific target
if not with_deps:
Expand All @@ -214,14 +214,14 @@ def collect(self, target, with_deps=False, recursive=False, graph=None):
return list(nx.dfs_postorder_nodes(pipeline, stage))

def collect_granular(self, target, *args, **kwargs):
from dvc.stage import Stage
from ..dvcfile import Dvcfile

if not target:
return [(stage, None) for stage in self.stages]

# Optimization: do not collect the graph for a specific .dvc target
if Stage.is_valid_filename(target) and not kwargs.get("with_deps"):
return [(Stage.load(self, target), None)]
if Dvcfile.is_valid_filename(target) and not kwargs.get("with_deps"):
return [(Dvcfile(self, target).load(), None)]

try:
(out,) = self.find_outs_by_path(target, strict=False)
Expand Down Expand Up @@ -411,17 +411,17 @@ def stages(self):
NOTE: For large repos, this could be an expensive
operation. Consider using some memoization.
"""
from dvc.stage import Stage
from ..dvcfile import Dvcfile

stages = []
outs = set()

for root, dirs, files in self.tree.walk(self.root_dir):
for fname in files:
path = os.path.join(root, fname)
if not Stage.is_valid_filename(path):
if not Dvcfile.is_valid_filename(path):
continue
stage = Stage.load(self, path)
stage = Dvcfile(self, path).load()
stages.append(stage)

for out in stage.outs:
Expand Down
11 changes: 8 additions & 3 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import colorama

from . import locked
from ..dvcfile import Dvcfile
from ..exceptions import (
RecursiveAddingWhileUsingFilename,
OverlappingOutputPathsError,
)
from ..output.base import OutputDoesNotExistError
from ..progress import Tqdm
from ..repo.scm_context import scm_context
from ..stage import Stage
from ..utils import LARGE_DIR_SIZE, resolve_paths

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -85,7 +85,7 @@ def add(repo, targets, recursive=False, no_commit=False, fname=None):
if not no_commit:
stage.commit()

stage.dump()
Dvcfile(repo, stage.path).dump(stage)
pbar_stages.update()

stages_list += stages
Expand All @@ -107,14 +107,16 @@ def _find_all_targets(repo, target, recursive):
unit="file",
)
if not repo.is_dvc_internal(fname)
if not Stage.is_stage_file(fname)
if not Dvcfile.is_stage_file(fname)
if not repo.scm.belongs_to_scm(fname)
if not repo.scm.is_tracked(fname)
]
return [target]


def _create_stages(repo, targets, fname, pbar=None):
from dvc.stage import Stage

stages = []

for out in Tqdm(
Expand All @@ -125,6 +127,9 @@ def _create_stages(repo, targets, fname, pbar=None):
):
path, wdir, out = resolve_paths(repo, out)
stage = Stage.create(repo, fname or path, wdir=wdir, outs=[out])
if stage:
Dvcfile(repo, stage.path).overwrite_with_prompt(force=True)

repo._reset()

if not stage:
Expand Down
8 changes: 5 additions & 3 deletions dvc/repo/checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
import os

from dvc.compat import fspath
from dvc.exceptions import CheckoutError
from dvc.exceptions import CheckoutErrorSuggestGit
from dvc.exceptions import CheckoutError, CheckoutErrorSuggestGit
from dvc.progress import Tqdm
from dvc.utils import relpath

Expand Down Expand Up @@ -42,7 +41,10 @@ def _checkout(
relink=False,
recursive=False,
):
from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError
from dvc.stage.exceptions import (
StageFileBadNameError,
StageFileDoesNotExistError,
)

unused = []
stats = {
Expand Down
4 changes: 3 additions & 1 deletion dvc/repo/commit.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import locked
from dvc.dvcfile import Dvcfile


@locked
Expand All @@ -7,4 +8,5 @@ def commit(self, target, with_deps=False, recursive=False, force=False):
for stage in stages:
stage.check_can_commit(force=force)
stage.commit()
stage.dump()

Dvcfile(self, stage.path).dump(stage)
Loading