From 9d0cfcafdfeeec37a021a9f6a065ec63f82ec5e2 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 11:05:47 -0300 Subject: [PATCH 01/12] feat: [WIP] remote dir download --- dvc/remote/base.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 10969250a0..77bca46940 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -547,11 +547,20 @@ def download( if to_info.scheme != "local": raise NotImplementedError - logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) + makedirs(to_info.parent, exist_ok=True, mode=dir_mode) + + if self.isdir(from_info): + # need to create dir here + # also pass the individual file name (file and to) + for file_info in self.walk_files(from_info): + self.single_file_download(file_info, to_info, name, no_progress_bar, file_mode) + else: + self.single_file_download(from_info, to_info, name, no_progress_bar, file_mode) + def single_file_download(self, from_info, to_info, name, no_progress_bar, file_mode): + logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name - makedirs(to_info.parent, exist_ok=True, mode=dir_mode) tmp_file = tmp_fname(to_info) try: @@ -567,6 +576,7 @@ def download( return 0 + def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) From f6c719a6f30f0c989b3c39d07746d609ab6b63bc Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 14:53:01 -0300 Subject: [PATCH 02/12] feat: [WIP] dir download working single-thread --- dvc/remote/base.py | 51 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 77bca46940..6813c55e0b 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -30,6 +30,7 @@ from dvc.utils.compat import urlparse from dvc.utils.fs import move from dvc.utils.http import open_url +from pathlib import Path logger = logging.getLogger(__name__) @@ -549,15 +550,50 @@ def download( makedirs(to_info.parent, exist_ok=True, mode=dir_mode) + print(from_info) + if self.isdir(from_info): - # need to create dir here - # also pass the individual file name (file and to) - for file_info in self.walk_files(from_info): - self.single_file_download(file_info, to_info, name, no_progress_bar, file_mode) + makedirs(to_info, exist_ok=True, mode=dir_mode) + # TEMP limits to test + limit = 5 + current = 0 + # / TEMP limits to test + for file_from_info in self.walk_files(from_info): + # TEMP limits to test + current += 1 + if current == limit: + exit() + # / TEMP limits to test + + file_to_info = Path( + str(to_info) + + "/" + + str(file_from_info)[len(str(from_info)) + 1 :] + ) + + self.single_file_download( + file_from_info, + file_to_info, + name, + no_progress_bar, + file_mode, + dir_mode, + ) else: - self.single_file_download(from_info, to_info, name, no_progress_bar, file_mode) + self.single_file_download( + from_info, to_info, name, no_progress_bar, file_mode, dir_mode + ) + + def single_file_download( + self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode + ): + # Make sure full path exist + # print(to_info) + for parent in reversed(to_info.parents): + if not parent.exists(): + # print('making dir: {}'.format(parent)) + makedirs(parent, exist_ok=True, mode=dir_mode) - def single_file_download(self, from_info, to_info, name, no_progress_bar, file_mode): logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name @@ -568,7 +604,7 @@ def single_file_download(self, from_info, to_info, name, no_progress_bar, file_m from_info, tmp_file, name=name, no_progress_bar=no_progress_bar ) except Exception: - msg = "failed to download '{}' to '{}'" + msg = "failed to doooooownload '{}' to '{}'" logger.exception(msg.format(from_info, to_info)) return 1 # 1 fail @@ -576,7 +612,6 @@ def single_file_download(self, from_info, to_info, name, no_progress_bar, file_m return 0 - def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) From cf6da7bd8c18078196ee17cae1b017e3832a5e93 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 16:28:35 -0300 Subject: [PATCH 03/12] import-url: support directories: multi-thread working --- dvc/remote/base.py | 57 ++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 6813c55e0b..c22f30b5a1 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -30,7 +30,7 @@ from dvc.utils.compat import urlparse from dvc.utils.fs import move from dvc.utils.http import open_url -from pathlib import Path + logger = logging.getLogger(__name__) @@ -550,35 +550,33 @@ def download( makedirs(to_info.parent, exist_ok=True, mode=dir_mode) - print(from_info) - if self.isdir(from_info): makedirs(to_info, exist_ok=True, mode=dir_mode) - # TEMP limits to test - limit = 5 - current = 0 - # / TEMP limits to test - for file_from_info in self.walk_files(from_info): - # TEMP limits to test - current += 1 - if current == limit: - exit() - # / TEMP limits to test - - file_to_info = Path( - str(to_info) - + "/" - + str(file_from_info)[len(str(from_info)) + 1 :] - ) + file_to_infos = ( + to_info / file_to_info.relative_to(from_info) + for file_to_info in self.walk_files(from_info) + ) - self.single_file_download( + with ThreadPoolExecutor(max_workers=self.JOBS) as executor: + file_from_info = list(self.walk_files(from_info)) + with Tqdm( file_from_info, - file_to_info, - name, - no_progress_bar, - file_mode, - dir_mode, - ) + total=len(file_from_info), + desc="Downloading directory", + ) as file_from_info: + return sum( + executor.map( + partial( + self.single_file_download, + name=name, + no_progress_bar=True, + file_mode=file_mode, + dir_mode=dir_mode, + ), + file_from_info, + file_to_infos, + ) + ) else: self.single_file_download( from_info, to_info, name, no_progress_bar, file_mode, dir_mode @@ -587,12 +585,7 @@ def download( def single_file_download( self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode ): - # Make sure full path exist - # print(to_info) - for parent in reversed(to_info.parents): - if not parent.exists(): - # print('making dir: {}'.format(parent)) - makedirs(parent, exist_ok=True, mode=dir_mode) + makedirs(to_info.parent, exist_ok=True, mode=dir_mode) logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name From 3af4f85310588c19cbd80e0abb4cd4aad145a5e2 Mon Sep 17 00:00:00 2001 From: Vera <9991536+verasativa@users.noreply.github.com> Date: Thu, 5 Dec 2019 16:45:13 -0300 Subject: [PATCH 04/12] Update dvc/remote/base.py Co-Authored-By: Ruslan Kuprieiev --- dvc/remote/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index c22f30b5a1..8922a5787e 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -548,7 +548,6 @@ def download( if to_info.scheme != "local": raise NotImplementedError - makedirs(to_info.parent, exist_ok=True, mode=dir_mode) if self.isdir(from_info): makedirs(to_info, exist_ok=True, mode=dir_mode) From 164ded64532c8e87afbeb35dc21814c184571572 Mon Sep 17 00:00:00 2001 From: Vera <9991536+verasativa@users.noreply.github.com> Date: Thu, 5 Dec 2019 16:45:43 -0300 Subject: [PATCH 05/12] Update dvc/remote/base.py Co-Authored-By: Ruslan Kuprieiev --- dvc/remote/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 8922a5787e..6cebec7c3a 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -550,7 +550,6 @@ def download( if self.isdir(from_info): - makedirs(to_info, exist_ok=True, mode=dir_mode) file_to_infos = ( to_info / file_to_info.relative_to(from_info) for file_to_info in self.walk_files(from_info) From f7e0b575bd9dfe568deb797dfb216afa3eacac67 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 16:51:24 -0300 Subject: [PATCH 06/12] tests: add download_dir --- tests/unit/remote/test_remote_dir.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index 583898a60d..9e740a9684 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import pytest +import os from dvc.remote.s3 import RemoteS3 @@ -132,3 +133,12 @@ def test_isfile(remote): for expected, path in test_cases: assert remote.isfile(remote.path_info / path) == expected + + +@pytest.mark.parametrize("remote", remotes, indirect=True) +def test_download_dir(remote, tmpdir): + path = os.fspath(tmpdir / "data") + to_info = os.PathInfo(path) + remote.download(remote.path_info / "data", to_info) + assert os.path.isdir(path) + # check the list of files From 6f649db7f436bb984556776cfc52e93fcc26c44d Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 17:39:38 -0300 Subject: [PATCH 07/12] import-url: support directories: code cleanup --- dvc/remote/base.py | 63 +++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 6cebec7c3a..4c4e75f4c2 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -31,7 +31,6 @@ from dvc.utils.fs import move from dvc.utils.http import open_url - logger = logging.getLogger(__name__) STATUS_OK = 1 @@ -548,39 +547,45 @@ def download( if to_info.scheme != "local": raise NotImplementedError - if self.isdir(from_info): - file_to_infos = ( - to_info / file_to_info.relative_to(from_info) - for file_to_info in self.walk_files(from_info) + self._download_dir( + from_info, to_info, name, no_progress_bar, file_mode, dir_mode ) - - with ThreadPoolExecutor(max_workers=self.JOBS) as executor: - file_from_info = list(self.walk_files(from_info)) - with Tqdm( - file_from_info, - total=len(file_from_info), - desc="Downloading directory", - ) as file_from_info: - return sum( - executor.map( - partial( - self.single_file_download, - name=name, - no_progress_bar=True, - file_mode=file_mode, - dir_mode=dir_mode, - ), - file_from_info, - file_to_infos, - ) - ) else: - self.single_file_download( + self._download_file( from_info, to_info, name, no_progress_bar, file_mode, dir_mode ) - def single_file_download( + def _download_dir( + self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode + ): + file_to_infos = ( + to_info / file_to_info.relative_to(from_info) + for file_to_info in self.walk_files(from_info) + ) + + with ThreadPoolExecutor(max_workers=self.JOBS) as executor: + file_from_info = list(self.walk_files(from_info)) + with Tqdm( + file_from_info, + total=len(file_from_info), + desc="Downloading directory", + ) as file_from_info: + return sum( + executor.map( + partial( + self._download_file, + name=name, + no_progress_bar=True, + file_mode=file_mode, + dir_mode=dir_mode, + ), + file_from_info, + file_to_infos, + ) + ) + + def _download_file( self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode ): makedirs(to_info.parent, exist_ok=True, mode=dir_mode) @@ -595,7 +600,7 @@ def single_file_download( from_info, tmp_file, name=name, no_progress_bar=no_progress_bar ) except Exception: - msg = "failed to doooooownload '{}' to '{}'" + msg = "failed to download '{}' to '{}'" logger.exception(msg.format(from_info, to_info)) return 1 # 1 fail From 4e53f17948623ac527047da97c9c484ee8bbcb14 Mon Sep 17 00:00:00 2001 From: Vera <9991536+verasativa@users.noreply.github.com> Date: Thu, 5 Dec 2019 17:41:17 -0300 Subject: [PATCH 08/12] Update tests/unit/remote/test_remote_dir.py Co-Authored-By: Ruslan Kuprieiev --- tests/unit/remote/test_remote_dir.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index 9e740a9684..f1e776760e 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -141,4 +141,12 @@ def test_download_dir(remote, tmpdir): to_info = os.PathInfo(path) remote.download(remote.path_info / "data", to_info) assert os.path.isdir(path) - # check the list of files + data_dir = tmpdir / "data" + assert len(list(walk_files(path, None))) == 7 + assert (data_dir / "alice").read_text(encoding="utf-8") == "alice" + assert (data_dir / "alpha").read_text(encoding="utf-8") == "alpha" + assert (data_dir / "subdir-file.txt").read_text(encoding="utf-8") == "subdir" + assert (data_dir / "subdir" / "1").read_text(encoding="utf-8") == "1" + assert (data_dir / "subdir" / "2").read_text(encoding="utf-8") == "2" + assert (data_dir / "subdir" / "3").read_text(encoding="utf-8") == "3" + assert (data_dir / "subdir" / "empty_file").read_text(encoding="utf-8") == "" From a0f18696dc2939517ef57629725e3c61dad0a3a8 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 18:03:03 -0300 Subject: [PATCH 09/12] refactor: partial into function var --- dvc/remote/base.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 4c4e75f4c2..d6c688bf1d 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -548,11 +548,11 @@ def download( raise NotImplementedError if self.isdir(from_info): - self._download_dir( + return self._download_dir( from_info, to_info, name, no_progress_bar, file_mode, dir_mode ) else: - self._download_file( + return self._download_file( from_info, to_info, name, no_progress_bar, file_mode, dir_mode ) @@ -566,23 +566,20 @@ def _download_dir( with ThreadPoolExecutor(max_workers=self.JOBS) as executor: file_from_info = list(self.walk_files(from_info)) + download_files = partial( + self._download_file, + name=name, + no_progress_bar=True, + file_mode=file_mode, + dir_mode=dir_mode, + ) with Tqdm( file_from_info, total=len(file_from_info), desc="Downloading directory", ) as file_from_info: return sum( - executor.map( - partial( - self._download_file, - name=name, - no_progress_bar=True, - file_mode=file_mode, - dir_mode=dir_mode, - ), - file_from_info, - file_to_infos, - ) + executor.map(download_files, file_from_info, file_to_infos) ) def _download_file( From 120f087f1c3bb4879e4be2069cf5617cced40fe6 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Fri, 6 Dec 2019 10:04:05 -0300 Subject: [PATCH 10/12] fix: tqdm reading futures --- dvc/remote/base.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index d6c688bf1d..c9365dd1c3 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -573,14 +573,16 @@ def _download_dir( file_mode=file_mode, dir_mode=dir_mode, ) + futures = executor.map( + download_files, file_from_info, file_to_infos + ) with Tqdm( - file_from_info, + futures, total=len(file_from_info), desc="Downloading directory", - ) as file_from_info: - return sum( - executor.map(download_files, file_from_info, file_to_infos) - ) + unit="Files", + ) as futures: + return sum(futures) def _download_file( self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode From 3603da89ecc190017fc9f5887c19454d0570b0cd Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Fri, 6 Dec 2019 10:08:02 -0300 Subject: [PATCH 11/12] fix: respect no_progress_bar and remove human else --- dvc/remote/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index c9365dd1c3..70e5266390 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -551,10 +551,9 @@ def download( return self._download_dir( from_info, to_info, name, no_progress_bar, file_mode, dir_mode ) - else: - return self._download_file( - from_info, to_info, name, no_progress_bar, file_mode, dir_mode - ) + return self._download_file( + from_info, to_info, name, no_progress_bar, file_mode, dir_mode + ) def _download_dir( self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode @@ -581,6 +580,7 @@ def _download_dir( total=len(file_from_info), desc="Downloading directory", unit="Files", + disable=no_progress_bar, ) as futures: return sum(futures) From 5d4e05d074763c271bdf34d81414a111b10edf6c Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Fri, 6 Dec 2019 13:19:40 +0000 Subject: [PATCH 12/12] Restyled by black --- tests/unit/remote/test_remote_dir.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index f1e776760e..a62ffdca54 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -145,8 +145,12 @@ def test_download_dir(remote, tmpdir): assert len(list(walk_files(path, None))) == 7 assert (data_dir / "alice").read_text(encoding="utf-8") == "alice" assert (data_dir / "alpha").read_text(encoding="utf-8") == "alpha" - assert (data_dir / "subdir-file.txt").read_text(encoding="utf-8") == "subdir" + assert (data_dir / "subdir-file.txt").read_text( + encoding="utf-8" + ) == "subdir" assert (data_dir / "subdir" / "1").read_text(encoding="utf-8") == "1" assert (data_dir / "subdir" / "2").read_text(encoding="utf-8") == "2" assert (data_dir / "subdir" / "3").read_text(encoding="utf-8") == "3" - assert (data_dir / "subdir" / "empty_file").read_text(encoding="utf-8") == "" + assert (data_dir / "subdir" / "empty_file").read_text( + encoding="utf-8" + ) == ""