From b279fb2ff702e16ae555087df42196b04e8d14cf Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sun, 29 Sep 2019 09:58:26 -0700 Subject: [PATCH 01/33] Basic GDrive remote support --- MANIFEST.in | 1 + dvc/remote/__init__.py | 2 + dvc/remote/gdrive/__init__.py | 144 ++++++++++++++++++ dvc/remote/gdrive/settings.yaml | 14 ++ dvc/remote/gdrive/utils.py | 58 +++++++ dvc/scheme.py | 1 + .../068b8e92002dd24414a9995a80726a14.enc | Bin 0 -> 496 bytes .../589e2f63a0de57566be6c247074399db.enc | Bin 0 -> 496 bytes setup.py | 3 + tests/conftest.py | 9 ++ tests/func/test_data_cloud.py | 38 +++++ tests/func/test_gdrive.py | 79 ++++++++++ tests/unit/remote/gdrive/__init__.py | 0 tests/unit/remote/gdrive/conftest.py | 15 ++ tests/unit/remote/gdrive/test_gdrive.py | 13 ++ tests/unit/remote/gdrive/test_utils.py | 0 16 files changed, 377 insertions(+) create mode 100644 dvc/remote/gdrive/__init__.py create mode 100644 dvc/remote/gdrive/settings.yaml create mode 100644 dvc/remote/gdrive/utils.py create mode 100644 scripts/ci/gdrive-oauth2/068b8e92002dd24414a9995a80726a14.enc create mode 100644 scripts/ci/gdrive-oauth2/589e2f63a0de57566be6c247074399db.enc create mode 100644 tests/func/test_gdrive.py create mode 100644 tests/unit/remote/gdrive/__init__.py create mode 100644 tests/unit/remote/gdrive/conftest.py create mode 100644 tests/unit/remote/gdrive/test_gdrive.py create mode 100644 tests/unit/remote/gdrive/test_utils.py diff --git a/MANIFEST.in b/MANIFEST.in index 03d2f17a9a..35e43bb296 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include fastentrypoints.py include LICENSE +include dvc/remote/gdrive/settings.yaml diff --git a/dvc/remote/__init__.py b/dvc/remote/__init__.py index e8ffe81f45..f14ed9f5d4 100644 --- a/dvc/remote/__init__.py +++ b/dvc/remote/__init__.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from dvc.remote.azure import RemoteAZURE +from dvc.remote.gdrive import RemoteGDrive from dvc.remote.gs import RemoteGS from dvc.remote.hdfs import RemoteHDFS from dvc.remote.local import RemoteLOCAL @@ -15,6 +16,7 @@ REMOTES = [ RemoteAZURE, + RemoteGDrive, RemoteGS, RemoteHDFS, RemoteHTTP, diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py new file mode 100644 index 0000000000..f11734bdc0 --- /dev/null +++ b/dvc/remote/gdrive/__init__.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import os +import logging + +try: + from pydrive.auth import GoogleAuth + from pydrive.drive import GoogleDrive +except ImportError: + GoogleAuth = None + GoogleDrive = None + +from dvc.scheme import Schemes +from dvc.path_info import CloudURLInfo +from dvc.remote.base import RemoteBASE +from dvc.config import Config +from dvc.remote.gdrive.utils import TrackFileReadProgress +from dvc.progress import Tqdm + + +logger = logging.getLogger(__name__) + + +class GDriveURLInfo(CloudURLInfo): + @property + def netloc(self): + return self.parsed.netloc + + +class RemoteGDrive(RemoteBASE): + scheme = Schemes.GDRIVE + path_cls = GDriveURLInfo + REGEX = r"^gdrive://.*$" + REQUIRES = {"pydrive": "pydrive"} + PARAM_CHECKSUM = "md5Checksum" + GOOGLE_AUTH_SETTINGS_PATH = os.path.join( + os.path.dirname(__file__), "settings.yaml" + ) + + def __init__(self, repo, config): + super(RemoteGDrive, self).__init__(repo, config) + self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) + self.root_content_cached = False + self.root_dirs_list = {} + self.init_gdrive() + + def init_gdrive(self): + self.gdrive = self.drive() + self.cache_root_content() + + def drive(self): + 
GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" + gauth = GoogleAuth(settings_file=self.GOOGLE_AUTH_SETTINGS_PATH) + gauth.CommandLineAuth() + return GoogleDrive(gauth) + + def cache_root_content(self): + if not self.root_content_cached: + for dirs_list in self.gdrive.ListFile( + { + "q": "'%s' in parents and trashed=false" + % self.path_info.netloc, + "maxResults": 256, + } + ): + for dir1 in dirs_list: + self.root_dirs_list[dir1["title"]] = dir1["id"] + self.root_content_cached = True + + def get_path_id(self, path_info, create=False): + file_id = "" + parts = path_info.path.split("/") + + if parts and (parts[0] in self.root_dirs_list): + parent_id = self.root_dirs_list[parts[0]] + file_id = self.root_dirs_list[parts[0]] + parts.pop(0) + else: + parent_id = path_info.netloc + file_list = self.gdrive.ListFile( + {"q": "'%s' in parents and trashed=false" % parent_id} + ).GetList() + + for part in parts: + file_id = "" + for f in file_list: + if f["title"] == part: + file_id = f["id"] + file_list = self.gdrive.ListFile( + {"q": "'%s' in parents and trashed=false" % file_id} + ).GetList() + parent_id = f["id"] + break + if file_id == "": + if create: + gdrive_file = self.gdrive.CreateFile( + { + "title": part, + "parents": [{"id": parent_id}], + "mimeType": "application/vnd.google-apps.folder", + } + ) + gdrive_file.Upload() + file_id = gdrive_file["id"] + else: + break + return file_id + + def exists(self, path_info): + return self.get_path_id(path_info) != "" + + def batch_exists(self, path_infos, callback): + results = [] + for path_info in path_infos: + results.append(self.exists(path_info)) + callback.update(str(path_info)) + return results + + def _upload(self, from_file, to_info, name, no_progress_bar): + + dirname = to_info.parent + if dirname: + parent_id = self.get_path_id(dirname, True) + else: + parent_id = to_info.netloc + + file1 = self.gdrive.CreateFile( + {"title": to_info.name, "parents": [{"id": parent_id}]} + ) + + from_file = open(from_file, "rb") + if not no_progress_bar: + from_file = TrackFileReadProgress(name, from_file) + + file1.content = from_file + file1.Upload() + from_file.close() + + def _download(self, from_info, to_file, name, no_progress_bar): + file_id = self.get_path_id(from_info) + gdrive_file = self.gdrive.CreateFile({"id": file_id}) + gdrive_file.GetContentFile(to_file) + #if not no_progress_bar: + # progress.update_target(name, 1, 1) diff --git a/dvc/remote/gdrive/settings.yaml b/dvc/remote/gdrive/settings.yaml new file mode 100644 index 0000000000..02fa37f5cc --- /dev/null +++ b/dvc/remote/gdrive/settings.yaml @@ -0,0 +1,14 @@ +client_config_backend: settings +client_config: + client_id: 719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com + client_secret: 2fy_HyzSwkxkGzEken7hThXb + +save_credentials: True +save_credentials_backend: file +save_credentials_file: credentials.json + +get_refresh_token: True + +oauth_scope: + - https://www.googleapis.com/auth/drive + - https://www.googleapis.com/auth/drive.appdata \ No newline at end of file diff --git a/dvc/remote/gdrive/utils.py b/dvc/remote/gdrive/utils.py new file mode 100644 index 0000000000..44998ff7fc --- /dev/null +++ b/dvc/remote/gdrive/utils.py @@ -0,0 +1,58 @@ +import functools +import os +import threading +import logging + +from dvc.progress import Tqdm + + +logger = logging.getLogger(__name__) + + +MIME_GOOGLE_APPS_FOLDER = "application/vnd.google-apps.folder" + + +class TrackFileReadProgress(object): + def __init__(self, progress_name, fobj): + 
self.progress_name = progress_name + self.fobj = fobj + self.file_size = os.fstat(fobj.fileno()).st_size + + def read(self, size): + #progress.update_target( + # self.progress_name, self.fobj.tell(), self.file_size + #) + return self.fobj.read(size) + + def __getattr__(self, attr): + return getattr(self.fobj, attr) + + +def only_once(func): + lock = threading.Lock() + locks = {} + results = {} + + @functools.wraps(func) + def wrapped(*args, **kwargs): + key = (args, tuple(kwargs.items())) + # could do with just setdefault, but it would require + # create/delete a "default" Lock() object for each call, so it + # is better to lock a single one for a short time + with lock: + if key not in locks: + locks[key] = threading.Lock() + with locks[key]: + if key not in results: + results[key] = func(*args, **kwargs) + return results[key] + + return wrapped + + +@only_once +def shared_token_warning(): + logger.warning( + "Warning: a shared GoogleAPI token is in use. " + "Please create your own token." + ) diff --git a/dvc/scheme.py b/dvc/scheme.py index e12b768f58..5f7a8d1a28 100644 --- a/dvc/scheme.py +++ b/dvc/scheme.py @@ -9,5 +9,6 @@ class Schemes: HTTP = "http" HTTPS = "https" GS = "gs" + GDRIVE = "gdrive" LOCAL = "local" OSS = "oss" diff --git a/scripts/ci/gdrive-oauth2/068b8e92002dd24414a9995a80726a14.enc b/scripts/ci/gdrive-oauth2/068b8e92002dd24414a9995a80726a14.enc new file mode 100644 index 0000000000000000000000000000000000000000..3a47a817242c55fc62c155bd465182fabf1f2b6a GIT binary patch literal 496 zcmVwBUS14r_Jbh#r1M8Yfl zJR0t-`c|~d>*|XEWVhe#czo-+u{0F&!c}~E!xAuuaTc@!E`+t}w?c{HMxPIJ<-Xg{ zbm6M`BwJ~A>WkIO>Q93LpuB^0nYO<7V$Y&Nw6FQ+ph>gw7U1q6(D7>lssE_L#~s77 zitG=j4{2MU7vS{BB2xEy9Bw62*vGv;qyuGkuAo#fuphY z9sdP>Zb?1RONN!Yh&)C4uGX*s(U1%+2VstyDGAeL?Ah}cO=~vJf`G8HJnJ;x#YceW z!T|j`vkBZ3#NykOHO?LghtR-hzWS_|{RfzAhe13!=jY~$x%_2P&(uzSxT5??sIQjD mV^mcdlLM9gYJQ}cUq~1lwa1kx6sD2ja^fXKgRWeEqP+-3WCL*k literal 0 HcmV?d00001 diff --git a/scripts/ci/gdrive-oauth2/589e2f63a0de57566be6c247074399db.enc b/scripts/ci/gdrive-oauth2/589e2f63a0de57566be6c247074399db.enc new file mode 100644 index 0000000000000000000000000000000000000000..87cf3f17245f37025cd67b408969d4483fb8112f GIT binary patch literal 496 zcmVOU!sb z)36R=yVQ?a?_2bM5FtVmLy)C6>|{lhAz`+JSiHi}&PBufkQ%nykNQ`&&yZHoc>_iy9m3~Y z8QD)xKLgRSmv~ny*{5~E5(O&s>u67<HDpwK*KR@5k{x=}iK$~`YcUYa3r!+Z z$>k6~5&4%+Zs~h9v4|Vi7nQC%1_5MpX&(3i=w<%*HupR zISF`}WfZ(u8}Y=bZTYscvFuii5CDj9%-9(Vp;K#pQ=`HVB`~%V5#}*p+^_F8y;d|% z4g9AHTyWJs)Mx7TC!`M{OczF*ZBTn;=7Y~kH4wvn0CGSzKJ%7EAzlw-aT>%nAf=$D zC5n;`!!ir|s^=ok#6DG5A-5{O4~X@>;yp2X6{o73a+u#O3t(2)Nr=4OSqB_uJt-~W m2`YK@k5$LYTew=>I=L+#w<^o-SXQU|{a^-9oN~q{4bTWSQ0wae literal 0 HcmV?d00001 diff --git a/setup.py b/setup.py index 47bd6c46bb..e1e79920cf 100644 --- a/setup.py +++ b/setup.py @@ -86,6 +86,7 @@ def run(self): # Extra dependencies for remote integrations gs = ["google-cloud-storage==1.19.0"] +gdrive = ["pydrive==1.3.1"] s3 = ["boto3==1.9.115"] azure = ["azure-storage-blob==2.1.0"] oss = ["oss2==2.6.1"] @@ -118,6 +119,7 @@ def run(self): "xmltodict>=0.11.0", "awscli>=1.16.125", "google-compute-engine==2.8.13", + "pydrive>=1.3.1", "pywin32; sys_platform == 'win32'", "Pygments", # required by collective.checkdocs, "collective.checkdocs", @@ -147,6 +149,7 @@ def run(self): extras_require={ "all": all_remotes, "gs": gs, + "gdrive": gdrive, "s3": s3, "azure": azure, "oss": oss, diff --git a/tests/conftest.py b/tests/conftest.py index 9ca00335fb..2670bb3a6b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ from git.exc 
import GitCommandNotFound from dvc.remote.config import RemoteConfig +from dvc.remote.gdrive import RemoteGDrive from dvc.utils.compat import cast_bytes_py2 from dvc.remote.ssh.connection import SSHConnection from dvc.repo import Repo as DvcRepo @@ -17,6 +18,14 @@ os.environ[cast_bytes_py2("DVC_IGNORE_ISATTY")] = cast_bytes_py2("true") +# Make DVC tests use separate OAuth token to access Google Drive +def skip_pydrive_init(self): + pass + + +RemoteGDrive.init_gdrive = skip_pydrive_init + + @pytest.fixture(autouse=True) def reset_loglevel(request, caplog): """ diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 6ceeb07bb3..aabe9e544a 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -21,6 +21,7 @@ from dvc.remote import ( RemoteS3, RemoteGS, + RemoteGDrive, RemoteAZURE, RemoteOSS, RemoteLOCAL, @@ -69,6 +70,14 @@ def _should_test_aws(): return False +def _should_test_gdrive(): + if os.getenv("DVC_TEST_GDRIVE") == "true": + return True + elif os.getenv("DVC_TEST_GDRIVE") == "false": + return False + return True + + def _should_test_gcp(): do_test = env2bool("DVC_TEST_GCP", undefined=None) if do_test is not None: @@ -201,6 +210,10 @@ def get_aws_url(): return "s3://" + get_aws_storagepath() +def get_gdrive_url(): + return "gdrive://root/" + str(uuid.uuid4()) + + def get_gcp_storagepath(): return TEST_GCP_REPO_BUCKET + "/" + str(uuid.uuid4()) @@ -234,6 +247,7 @@ def test(self): clist = [ ("s3://mybucket/", RemoteS3), + ("gdrive://root/", RemoteGDrive), ("gs://mybucket/", RemoteGS), ("ssh://user@localhost:/", RemoteSSH), ("http://localhost:8000/", RemoteHTTP), @@ -374,6 +388,17 @@ def _get_cloud_class(self): return RemoteS3 +class TestRemoteGDrive(TestDataCloudBase): + def _should_test(self): + return _should_test_gdrive() + + def _get_url(self): + return get_gdrive_url() + + def _get_cloud_class(self): + return RemoteGDrive + + class TestRemoteGS(TestDataCloudBase): def _should_test(self): return _should_test_gcp() @@ -620,6 +645,19 @@ def _test(self): self._test_cloud(TEST_REMOTE) +class TestRemoteGDriveCLI(TestDataCloudCLIBase): + def _should_test(self): + return _should_test_gdrive() + + def _test(self): + url = get_gdrive_url() + + self.main(["remote", "add", TEST_REMOTE, url]) + self.main(["remote", "modify", TEST_REMOTE, "oauth_id", "test"]) + + self._test_cloud(TEST_REMOTE) + + class TestRemoteGSCLI(TestDataCloudCLIBase): def _should_test(self): return _should_test_gcp() diff --git a/tests/func/test_gdrive.py b/tests/func/test_gdrive.py new file mode 100644 index 0000000000..7f57c261be --- /dev/null +++ b/tests/func/test_gdrive.py @@ -0,0 +1,79 @@ +from subprocess import check_call +import shutil +import os +import tempfile + +import pytest + +from dvc.main import main +from dvc.remote.gdrive import RemoteGDrive +from dvc.remote.gdrive.client import GDriveClient + + +if os.getenv("DVC_TEST_GDRIVE") != "true": + pytest.skip("Skipping long GDrive tests", allow_module_level=True) + + +client = GDriveClient( + "drive", + "test", + RemoteGDrive.DEFAULT_CREDENTIALPATH, + [RemoteGDrive.SCOPE_DRIVE], + "console", +) +root_id = client.request("GET", "drive/v3/files/root").json()["id"] + + +@pytest.mark.parametrize( + "base_url", + ["gdrive://root/", "gdrive://" + root_id + "/", "gdrive://appDataFolder/"], +) +def test_gdrive_push_pull(repo_dir, dvc_repo, base_url): + + dirname = tempfile.mktemp("", "dvc_test_", "") + url = base_url + dirname + files = [repo_dir.FOO, repo_dir.DATA_SUB.split(os.path.sep)[0]] + + gdrive = 
RemoteGDrive(dvc_repo, {"url": url}) + + # push files + check_call(["dvc", "add"] + files) + check_call(["dvc", "remote", "add", "gdrive", url]) + check_call(["dvc", "remote", "modify", "gdrive", "oauth_id", "test"]) + assert main(["push", "-r", "gdrive"]) == 0 + + paths = dvc_repo.cache.local.list_cache_paths() + paths = [i.parts[-2:] for i in paths] + + # check that files are correctly uploaded + testdir_meta = gdrive.gdrive.get_metadata(gdrive.path_info) + q = "'{}' in parents".format(testdir_meta["id"]) + found = list(gdrive.client.search(add_params={"q": q})) + assert set(i["name"] for i in found) == set([i[0] for i in paths]) + q = " or ".join("'{}' in parents".format(i["id"]) for i in found) + found = list(gdrive.client.search(add_params={"q": q})) + assert set(i["name"] for i in found) == set(i[1] for i in paths) + + # remove cache and files + shutil.rmtree(".dvc/cache") + for i in files: + if os.path.isdir(i): + shutil.rmtree(i) + else: + os.remove(i) + + # check that they are in list_cache_paths + assert set(gdrive.list_cache_paths()) == { + "/".join([dirname] + list(i)) for i in paths + } + + # pull them back from remote + assert main(["pull", "-r", "gdrive"]) == 0 + + assert set(files) < set(os.listdir(".")) + + # remove the temporary directory on Google Drive + resp = gdrive.gdrive.request( + "DELETE", "drive/v3/files/" + testdir_meta["id"] + ) + print("Delete temp dir: HTTP {}".format(resp.status_code)) diff --git a/tests/unit/remote/gdrive/__init__.py b/tests/unit/remote/gdrive/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/remote/gdrive/conftest.py b/tests/unit/remote/gdrive/conftest.py new file mode 100644 index 0000000000..4f9c8a3d21 --- /dev/null +++ b/tests/unit/remote/gdrive/conftest.py @@ -0,0 +1,15 @@ +import pytest + +from dvc.repo import Repo +from dvc.remote.gdrive import RemoteGDrive + + +@pytest.fixture() +def repo(): + return Repo(".") + + +@pytest.fixture +def gdrive(repo): + ret = RemoteGDrive(repo, {"url": "gdrive://root/data"}) + return ret diff --git a/tests/unit/remote/gdrive/test_gdrive.py b/tests/unit/remote/gdrive/test_gdrive.py new file mode 100644 index 0000000000..ec1e2a3446 --- /dev/null +++ b/tests/unit/remote/gdrive/test_gdrive.py @@ -0,0 +1,13 @@ +from dvc.remote.gdrive import RemoteGDrive + + +def test_init_drive(repo): + url = "gdrive://root/data" + gdrive = RemoteGDrive(repo, {"url": url}) + assert str(gdrive.path_info) == url + + +def test_init_folder_id(repo): + url = "gdrive://folder_id/data" + gdrive = RemoteGDrive(repo, {"url": url}) + assert str(gdrive.path_info) == url diff --git a/tests/unit/remote/gdrive/test_utils.py b/tests/unit/remote/gdrive/test_utils.py new file mode 100644 index 0000000000..e69de29bb2 From 3f5b22f35ac1e777597f67c22a3c9dbca4a60a3b Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Wed, 2 Oct 2019 06:19:59 -0700 Subject: [PATCH 02/33] Support upload progress bar with Tqdm --- dvc/remote/gdrive/__init__.py | 4 ++-- dvc/remote/gdrive/utils.py | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index f11734bdc0..a44d0f6d4a 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -15,7 +15,6 @@ from dvc.remote.base import RemoteBASE from dvc.config import Config from dvc.remote.gdrive.utils import TrackFileReadProgress -from dvc.progress import Tqdm logger = logging.getLogger(__name__) @@ -133,6 +132,7 @@ def _upload(self, from_file, to_info, name, 
no_progress_bar): from_file = TrackFileReadProgress(name, from_file) file1.content = from_file + file1.Upload() from_file.close() @@ -140,5 +140,5 @@ def _download(self, from_info, to_file, name, no_progress_bar): file_id = self.get_path_id(from_info) gdrive_file = self.gdrive.CreateFile({"id": file_id}) gdrive_file.GetContentFile(to_file) - #if not no_progress_bar: + # if not no_progress_bar: # progress.update_target(name, 1, 1) diff --git a/dvc/remote/gdrive/utils.py b/dvc/remote/gdrive/utils.py index 44998ff7fc..b0c051c087 100644 --- a/dvc/remote/gdrive/utils.py +++ b/dvc/remote/gdrive/utils.py @@ -13,17 +13,27 @@ class TrackFileReadProgress(object): + UPDATE_AFTER_READ_COUNT = 30 + def __init__(self, progress_name, fobj): self.progress_name = progress_name self.fobj = fobj self.file_size = os.fstat(fobj.fileno()).st_size + self.tqdm = Tqdm(desc=self.progress_name, total=self.file_size) + self.update_counter = 0 def read(self, size): - #progress.update_target( - # self.progress_name, self.fobj.tell(), self.file_size - #) + if self.update_counter == 0: + self.tqdm.update_to(self.fobj.tell()) + self.update_counter = self.UPDATE_AFTER_READ_COUNT + else: + self.update_counter -= 1 return self.fobj.read(size) + def close(self): + self.fobj.close() + self.tqdm.close() + def __getattr__(self, attr): return getattr(self.fobj, attr) From 33ac722ca4b252f8ed679e92a5332da2324b1e16 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Thu, 3 Oct 2019 08:34:02 -0700 Subject: [PATCH 03/33] Fix found issues --- dvc/remote/gdrive/__init__.py | 58 +++++++++++++++---------- dvc/remote/gdrive/utils.py | 4 +- tests/conftest.py | 2 +- tests/func/test_data_cloud.py | 3 +- tests/func/test_gdrive.py | 79 ----------------------------------- 5 files changed, 40 insertions(+), 106 deletions(-) delete mode 100644 tests/func/test_gdrive.py diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index a44d0f6d4a..20051eb73f 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import os -import logging try: from pydrive.auth import GoogleAuth @@ -17,9 +16,6 @@ from dvc.remote.gdrive.utils import TrackFileReadProgress -logger = logging.getLogger(__name__) - - class GDriveURLInfo(CloudURLInfo): @property def netloc(self): @@ -53,6 +49,10 @@ def drive(self): gauth.CommandLineAuth() return GoogleDrive(gauth) + def cache_dirs(self, dirs_list): + for dir1 in dirs_list: + self.root_dirs_list[dir1["title"]] = dir1["id"] + def cache_root_content(self): if not self.root_content_cached: for dirs_list in self.gdrive.ListFile( @@ -62,33 +62,23 @@ def cache_root_content(self): "maxResults": 256, } ): - for dir1 in dirs_list: - self.root_dirs_list[dir1["title"]] = dir1["id"] + self.cache_dirs(dirs_list) self.root_content_cached = True - def get_path_id(self, path_info, create=False): - file_id = "" - parts = path_info.path.split("/") - - if parts and (parts[0] in self.root_dirs_list): - parent_id = self.root_dirs_list[parts[0]] - file_id = self.root_dirs_list[parts[0]] - parts.pop(0) - else: - parent_id = path_info.netloc + def resolve_file_id(self, file_id, parent_id, path_parts, create): file_list = self.gdrive.ListFile( {"q": "'%s' in parents and trashed=false" % parent_id} ).GetList() - for part in parts: + for part in path_parts: file_id = "" - for f in file_list: - if f["title"] == part: - file_id = f["id"] + for file1 in file_list: + if file1["title"] == part: + file_id = file1["id"] file_list = self.gdrive.ListFile( {"q": "'%s' in 
parents and trashed=false" % file_id} ).GetList() - parent_id = f["id"] + parent_id = file1["id"] break if file_id == "": if create: @@ -105,6 +95,19 @@ def get_path_id(self, path_info, create=False): break return file_id + def get_path_id(self, path_info, create=False): + file_id = "" + parts = path_info.path.split("/") + + if parts and (parts[0] in self.root_dirs_list): + parent_id = self.root_dirs_list[parts[0]] + file_id = self.root_dirs_list[parts[0]] + parts.pop(0) + else: + parent_id = path_info.netloc + + return self.resolve_file_id(file_id, parent_id, parts, create) + def exists(self, path_info): return self.get_path_id(path_info) != "" @@ -136,9 +139,20 @@ def _upload(self, from_file, to_info, name, no_progress_bar): file1.Upload() from_file.close() - def _download(self, from_info, to_file, name, no_progress_bar): + def _download( + self, from_info, to_file, _unused_name, _unused_no_progress_bar + ): file_id = self.get_path_id(from_info) gdrive_file = self.gdrive.CreateFile({"id": file_id}) gdrive_file.GetContentFile(to_file) # if not no_progress_bar: # progress.update_target(name, 1, 1) + + def get_file_checksum(self, path_info): + raise NotImplementedError + + def list_cache_paths(self): + raise NotImplementedError + + def walk(self, path_info): + raise NotImplementedError diff --git a/dvc/remote/gdrive/utils.py b/dvc/remote/gdrive/utils.py index b0c051c087..5331165640 100644 --- a/dvc/remote/gdrive/utils.py +++ b/dvc/remote/gdrive/utils.py @@ -6,7 +6,7 @@ from dvc.progress import Tqdm -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) MIME_GOOGLE_APPS_FOLDER = "application/vnd.google-apps.folder" @@ -62,7 +62,7 @@ def wrapped(*args, **kwargs): @only_once def shared_token_warning(): - logger.warning( + LOGGER.warning( "Warning: a shared GoogleAPI token is in use. " "Please create your own token." 
) diff --git a/tests/conftest.py b/tests/conftest.py index 2670bb3a6b..f1f8c90ce5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,7 +19,7 @@ # Make DVC tests use separate OAuth token to access Google Drive -def skip_pydrive_init(self): +def skip_pydrive_init(_): pass diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index aabe9e544a..d45a396a2d 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -75,7 +75,7 @@ def _should_test_gdrive(): return True elif os.getenv("DVC_TEST_GDRIVE") == "false": return False - return True + return False def _should_test_gcp(): @@ -653,7 +653,6 @@ def _test(self): url = get_gdrive_url() self.main(["remote", "add", TEST_REMOTE, url]) - self.main(["remote", "modify", TEST_REMOTE, "oauth_id", "test"]) self._test_cloud(TEST_REMOTE) diff --git a/tests/func/test_gdrive.py b/tests/func/test_gdrive.py deleted file mode 100644 index 7f57c261be..0000000000 --- a/tests/func/test_gdrive.py +++ /dev/null @@ -1,79 +0,0 @@ -from subprocess import check_call -import shutil -import os -import tempfile - -import pytest - -from dvc.main import main -from dvc.remote.gdrive import RemoteGDrive -from dvc.remote.gdrive.client import GDriveClient - - -if os.getenv("DVC_TEST_GDRIVE") != "true": - pytest.skip("Skipping long GDrive tests", allow_module_level=True) - - -client = GDriveClient( - "drive", - "test", - RemoteGDrive.DEFAULT_CREDENTIALPATH, - [RemoteGDrive.SCOPE_DRIVE], - "console", -) -root_id = client.request("GET", "drive/v3/files/root").json()["id"] - - -@pytest.mark.parametrize( - "base_url", - ["gdrive://root/", "gdrive://" + root_id + "/", "gdrive://appDataFolder/"], -) -def test_gdrive_push_pull(repo_dir, dvc_repo, base_url): - - dirname = tempfile.mktemp("", "dvc_test_", "") - url = base_url + dirname - files = [repo_dir.FOO, repo_dir.DATA_SUB.split(os.path.sep)[0]] - - gdrive = RemoteGDrive(dvc_repo, {"url": url}) - - # push files - check_call(["dvc", "add"] + files) - check_call(["dvc", "remote", "add", "gdrive", url]) - check_call(["dvc", "remote", "modify", "gdrive", "oauth_id", "test"]) - assert main(["push", "-r", "gdrive"]) == 0 - - paths = dvc_repo.cache.local.list_cache_paths() - paths = [i.parts[-2:] for i in paths] - - # check that files are correctly uploaded - testdir_meta = gdrive.gdrive.get_metadata(gdrive.path_info) - q = "'{}' in parents".format(testdir_meta["id"]) - found = list(gdrive.client.search(add_params={"q": q})) - assert set(i["name"] for i in found) == set([i[0] for i in paths]) - q = " or ".join("'{}' in parents".format(i["id"]) for i in found) - found = list(gdrive.client.search(add_params={"q": q})) - assert set(i["name"] for i in found) == set(i[1] for i in paths) - - # remove cache and files - shutil.rmtree(".dvc/cache") - for i in files: - if os.path.isdir(i): - shutil.rmtree(i) - else: - os.remove(i) - - # check that they are in list_cache_paths - assert set(gdrive.list_cache_paths()) == { - "/".join([dirname] + list(i)) for i in paths - } - - # pull them back from remote - assert main(["pull", "-r", "gdrive"]) == 0 - - assert set(files) < set(os.listdir(".")) - - # remove the temporary directory on Google Drive - resp = gdrive.gdrive.request( - "DELETE", "drive/v3/files/" + testdir_meta["id"] - ) - print("Delete temp dir: HTTP {}".format(resp.status_code)) From f1ad91cae5e88ce346de9647f2e9b9d688f80e92 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Thu, 3 Oct 2019 09:57:16 -0700 Subject: [PATCH 04/33] More fixes --- dvc/remote/gdrive/__init__.py | 52 
+++++++++++------- dvc/remote/gdrive/settings.yaml | 4 +- .../068b8e92002dd24414a9995a80726a14.enc | Bin 496 -> 0 bytes .../589e2f63a0de57566be6c247074399db.enc | Bin 496 -> 0 bytes 4 files changed, 33 insertions(+), 23 deletions(-) delete mode 100644 scripts/ci/gdrive-oauth2/068b8e92002dd24414a9995a80726a14.enc delete mode 100644 scripts/ci/gdrive-oauth2/589e2f63a0de57566be6c247074399db.enc diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 20051eb73f..d93a7cd23a 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -65,34 +65,44 @@ def cache_root_content(self): self.cache_dirs(dirs_list) self.root_content_cached = True + def resolve_file_id_from_part(self, part, parent_id, file_list): + file_id = "" + for file1 in file_list: + if file1["title"] == part: + file_id = file1["id"] + file_list = self.gdrive.ListFile( + {"q": "'%s' in parents and trashed=false" % file_id} + ).GetList() + parent_id = file1["id"] + break + return file_id, parent_id, file_list + + def create_file_id(self, file_id, parent_id, part, create): + if file_id == "": + if create: + gdrive_file = self.gdrive.CreateFile( + { + "title": part, + "parents": [{"id": parent_id}], + "mimeType": "application/vnd.google-apps.folder", + } + ) + gdrive_file.Upload() + file_id = gdrive_file["id"] + return file_id + def resolve_file_id(self, file_id, parent_id, path_parts, create): file_list = self.gdrive.ListFile( {"q": "'%s' in parents and trashed=false" % parent_id} ).GetList() for part in path_parts: - file_id = "" - for file1 in file_list: - if file1["title"] == part: - file_id = file1["id"] - file_list = self.gdrive.ListFile( - {"q": "'%s' in parents and trashed=false" % file_id} - ).GetList() - parent_id = file1["id"] - break + file_id, parent_id, file_list = self.resolve_file_id_from_part( + part, parent_id, file_list + ) + file_id = self.create_file_id(file_id, parent_id, part, create) if file_id == "": - if create: - gdrive_file = self.gdrive.CreateFile( - { - "title": part, - "parents": [{"id": parent_id}], - "mimeType": "application/vnd.google-apps.folder", - } - ) - gdrive_file.Upload() - file_id = gdrive_file["id"] - else: - break + break return file_id def get_path_id(self, path_info, create=False): diff --git a/dvc/remote/gdrive/settings.yaml b/dvc/remote/gdrive/settings.yaml index 02fa37f5cc..59c06c7efa 100644 --- a/dvc/remote/gdrive/settings.yaml +++ b/dvc/remote/gdrive/settings.yaml @@ -1,7 +1,7 @@ client_config_backend: settings client_config: - client_id: 719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com - client_secret: 2fy_HyzSwkxkGzEken7hThXb + client_id: 470227652556-1n09ue25mtb7gp66i4lvks3jompjisen.apps.googleusercontent.com + client_secret: 0ipFt0Dn4V_Tge6kw7aiu0GR save_credentials: True save_credentials_backend: file diff --git a/scripts/ci/gdrive-oauth2/068b8e92002dd24414a9995a80726a14.enc b/scripts/ci/gdrive-oauth2/068b8e92002dd24414a9995a80726a14.enc deleted file mode 100644 index 3a47a817242c55fc62c155bd465182fabf1f2b6a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 496 zcmVwBUS14r_Jbh#r1M8Yfl zJR0t-`c|~d>*|XEWVhe#czo-+u{0F&!c}~E!xAuuaTc@!E`+t}w?c{HMxPIJ<-Xg{ zbm6M`BwJ~A>WkIO>Q93LpuB^0nYO<7V$Y&Nw6FQ+ph>gw7U1q6(D7>lssE_L#~s77 zitG=j4{2MU7vS{BB2xEy9Bw62*vGv;qyuGkuAo#fuphY z9sdP>Zb?1RONN!Yh&)C4uGX*s(U1%+2VstyDGAeL?Ah}cO=~vJf`G8HJnJ;x#YceW z!T|j`vkBZ3#NykOHO?LghtR-hzWS_|{RfzAhe13!=jY~$x%_2P&(uzSxT5??sIQjD mV^mcdlLM9gYJQ}cUq~1lwa1kx6sD2ja^fXKgRWeEqP+-3WCL*k diff --git 
a/scripts/ci/gdrive-oauth2/589e2f63a0de57566be6c247074399db.enc b/scripts/ci/gdrive-oauth2/589e2f63a0de57566be6c247074399db.enc deleted file mode 100644 index 87cf3f17245f37025cd67b408969d4483fb8112f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 496 zcmVOU!sb z)36R=yVQ?a?_2bM5FtVmLy)C6>|{lhAz`+JSiHi}&PBufkQ%nykNQ`&&yZHoc>_iy9m3~Y z8QD)xKLgRSmv~ny*{5~E5(O&s>u67<HDpwK*KR@5k{x=}iK$~`YcUYa3r!+Z z$>k6~5&4%+Zs~h9v4|Vi7nQC%1_5MpX&(3i=w<%*HupR zISF`}WfZ(u8}Y=bZTYscvFuii5CDj9%-9(Vp;K#pQ=`HVB`~%V5#}*p+^_F8y;d|% z4g9AHTyWJs)Mx7TC!`M{OczF*ZBTn;=7Y~kH4wvn0CGSzKJ%7EAzlw-aT>%nAf=$D zC5n;`!!ir|s^=ok#6DG5A-5{O4~X@>;yp2X6{o73a+u#O3t(2)Nr=4OSqB_uJt-~W m2`YK@k5$LYTew=>I=L+#w<^o-SXQU|{a^-9oN~q{4bTWSQ0wae From e2890f18ec0e281daacfef2b514f9454dc47c14c Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Thu, 3 Oct 2019 10:04:21 -0700 Subject: [PATCH 05/33] Remove unneccessary condition --- tests/func/test_data_cloud.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index d45a396a2d..72ee9fdff4 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -73,8 +73,6 @@ def _should_test_aws(): def _should_test_gdrive(): if os.getenv("DVC_TEST_GDRIVE") == "true": return True - elif os.getenv("DVC_TEST_GDRIVE") == "false": - return False return False From f45253868b01c364dfc04e523537be5df052b418 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sun, 6 Oct 2019 09:13:50 -0700 Subject: [PATCH 06/33] Enclose dependencies imports inside property --- dvc/remote/gdrive/__init__.py | 38 +++++++++++++++------------------ dvc/remote/gdrive/settings.yaml | 3 ++- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index d93a7cd23a..b49ff28a8d 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -2,13 +2,6 @@ import os -try: - from pydrive.auth import GoogleAuth - from pydrive.drive import GoogleDrive -except ImportError: - GoogleAuth = None - GoogleDrive = None - from dvc.scheme import Schemes from dvc.path_info import CloudURLInfo from dvc.remote.base import RemoteBASE @@ -37,17 +30,20 @@ def __init__(self, repo, config): self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) self.root_content_cached = False self.root_dirs_list = {} - self.init_gdrive() - - def init_gdrive(self): - self.gdrive = self.drive() + self._gdrive = None self.cache_root_content() + @property def drive(self): - GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - gauth = GoogleAuth(settings_file=self.GOOGLE_AUTH_SETTINGS_PATH) - gauth.CommandLineAuth() - return GoogleDrive(gauth) + from pydrive.auth import GoogleAuth + from pydrive.drive import GoogleDrive + + if self._gdrive is None: + GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" + gauth = GoogleAuth(settings_file=self.GOOGLE_AUTH_SETTINGS_PATH) + gauth.CommandLineAuth() + self._gdrive = GoogleDrive(gauth) + return self._gdrive def cache_dirs(self, dirs_list): for dir1 in dirs_list: @@ -55,7 +51,7 @@ def cache_dirs(self, dirs_list): def cache_root_content(self): if not self.root_content_cached: - for dirs_list in self.gdrive.ListFile( + for dirs_list in self.drive.ListFile( { "q": "'%s' in parents and trashed=false" % self.path_info.netloc, @@ -70,7 +66,7 @@ def resolve_file_id_from_part(self, part, parent_id, file_list): for file1 in file_list: if file1["title"] == part: file_id = file1["id"] - file_list = self.gdrive.ListFile( + 
file_list = self.drive.ListFile( {"q": "'%s' in parents and trashed=false" % file_id} ).GetList() parent_id = file1["id"] @@ -80,7 +76,7 @@ def resolve_file_id_from_part(self, part, parent_id, file_list): def create_file_id(self, file_id, parent_id, part, create): if file_id == "": if create: - gdrive_file = self.gdrive.CreateFile( + gdrive_file = self.drive.CreateFile( { "title": part, "parents": [{"id": parent_id}], @@ -92,7 +88,7 @@ def create_file_id(self, file_id, parent_id, part, create): return file_id def resolve_file_id(self, file_id, parent_id, path_parts, create): - file_list = self.gdrive.ListFile( + file_list = self.drive.ListFile( {"q": "'%s' in parents and trashed=false" % parent_id} ).GetList() @@ -136,7 +132,7 @@ def _upload(self, from_file, to_info, name, no_progress_bar): else: parent_id = to_info.netloc - file1 = self.gdrive.CreateFile( + file1 = self.drive.CreateFile( {"title": to_info.name, "parents": [{"id": parent_id}]} ) @@ -153,7 +149,7 @@ def _download( self, from_info, to_file, _unused_name, _unused_no_progress_bar ): file_id = self.get_path_id(from_info) - gdrive_file = self.gdrive.CreateFile({"id": file_id}) + gdrive_file = self.drive.CreateFile({"id": file_id}) gdrive_file.GetContentFile(to_file) # if not no_progress_bar: # progress.update_target(name, 1, 1) diff --git a/dvc/remote/gdrive/settings.yaml b/dvc/remote/gdrive/settings.yaml index 59c06c7efa..27731eb038 100644 --- a/dvc/remote/gdrive/settings.yaml +++ b/dvc/remote/gdrive/settings.yaml @@ -11,4 +11,5 @@ get_refresh_token: True oauth_scope: - https://www.googleapis.com/auth/drive - - https://www.googleapis.com/auth/drive.appdata \ No newline at end of file + - https://www.googleapis.com/auth/drive.appdata + \ No newline at end of file From 949afc6e1ef87b1cc76b19f3bdb12d660d9df809 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sun, 6 Oct 2019 09:38:31 -0700 Subject: [PATCH 07/33] Use cached property for gdrive object access --- dvc/remote/gdrive/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index b49ff28a8d..db58a92800 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -2,6 +2,8 @@ import os +from funcy import cached_property + from dvc.scheme import Schemes from dvc.path_info import CloudURLInfo from dvc.remote.base import RemoteBASE @@ -30,20 +32,17 @@ def __init__(self, repo, config): self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) self.root_content_cached = False self.root_dirs_list = {} - self._gdrive = None self.cache_root_content() - @property + @cached_property def drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive - if self._gdrive is None: - GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - gauth = GoogleAuth(settings_file=self.GOOGLE_AUTH_SETTINGS_PATH) - gauth.CommandLineAuth() - self._gdrive = GoogleDrive(gauth) - return self._gdrive + GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" + gauth = GoogleAuth(settings_file=self.GOOGLE_AUTH_SETTINGS_PATH) + gauth.CommandLineAuth() + return GoogleDrive(gauth) def cache_dirs(self, dirs_list): for dir1 in dirs_list: From 5ec22c828e8a1516d413f8b3ac8aced4a5421309 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Wed, 9 Oct 2019 13:34:05 -0700 Subject: [PATCH 08/33] Fix tests; Add default token usage warning --- dvc/config.py | 10 +++-- dvc/remote/gdrive/__init__.py | 58 ++++++++++++++++++++----- 
dvc/remote/gdrive/settings.yaml | 5 +-- tests/func/test_data_cloud.py | 2 + tests/unit/remote/gdrive/test_gdrive.py | 3 ++ 5 files changed, 62 insertions(+), 16 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index d89b2fc6f0..2ba234fd0f 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -147,6 +147,8 @@ class Config(object): # pylint: disable=too-many-instance-attributes CONFIG = "config" CONFIG_LOCAL = "config.local" + CREDENTIALPATH = "credentialpath" + LEVEL_LOCAL = 0 LEVEL_REPO = 1 LEVEL_GLOBAL = 2 @@ -215,7 +217,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes # backward compatibility SECTION_AWS = "aws" SECTION_AWS_STORAGEPATH = "storagepath" - SECTION_AWS_CREDENTIALPATH = "credentialpath" + SECTION_AWS_CREDENTIALPATH = CREDENTIALPATH SECTION_AWS_ENDPOINT_URL = "endpointurl" SECTION_AWS_LIST_OBJECTS = "listobjects" SECTION_AWS_REGION = "region" @@ -238,13 +240,15 @@ class Config(object): # pylint: disable=too-many-instance-attributes # backward compatibility SECTION_GCP = "gcp" SECTION_GCP_STORAGEPATH = SECTION_AWS_STORAGEPATH - SECTION_GCP_CREDENTIALPATH = SECTION_AWS_CREDENTIALPATH + SECTION_GCP_CREDENTIALPATH = CREDENTIALPATH SECTION_GCP_PROJECTNAME = "projectname" SECTION_GCP_SCHEMA = { SECTION_GCP_STORAGEPATH: str, Optional(SECTION_GCP_PROJECTNAME): str, } + SECTION_GDRIVE_CREDENTIALPATH = CREDENTIALPATH + # backward compatibility SECTION_LOCAL = "local" SECTION_LOCAL_STORAGEPATH = SECTION_AWS_STORAGEPATH @@ -271,7 +275,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_REMOTE_URL: str, Optional(SECTION_AWS_REGION): str, Optional(SECTION_AWS_PROFILE): str, - Optional(SECTION_AWS_CREDENTIALPATH): str, + Optional(CREDENTIALPATH): str, Optional(SECTION_AWS_ENDPOINT_URL): str, Optional(SECTION_AWS_LIST_OBJECTS, default=False): BOOL_SCHEMA, Optional(SECTION_AWS_USE_SSL, default=True): BOOL_SCHEMA, diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index db58a92800..fb1a1e8ae6 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -8,7 +8,7 @@ from dvc.path_info import CloudURLInfo from dvc.remote.base import RemoteBASE from dvc.config import Config -from dvc.remote.gdrive.utils import TrackFileReadProgress +from dvc.remote.gdrive.utils import TrackFileReadProgress, shared_token_warning class GDriveURLInfo(CloudURLInfo): @@ -23,24 +23,43 @@ class RemoteGDrive(RemoteBASE): REGEX = r"^gdrive://.*$" REQUIRES = {"pydrive": "pydrive"} PARAM_CHECKSUM = "md5Checksum" - GOOGLE_AUTH_SETTINGS_PATH = os.path.join( + DEFAULT_GOOGLE_AUTH_SETTINGS_PATH = os.path.join( os.path.dirname(__file__), "settings.yaml" ) + FOLDER_MIME_TYPE = "application/vnd.google-apps.folder" def __init__(self, repo, config): super(RemoteGDrive, self).__init__(repo, config) + if Config.SECTION_GDRIVE_CREDENTIALPATH not in config: + shared_token_warning() + self.gdrive_credentials_path = config.get( + Config.SECTION_GDRIVE_CREDENTIALPATH, + self.DEFAULT_GOOGLE_AUTH_SETTINGS_PATH, + ) self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) self.root_content_cached = False self.root_dirs_list = {} + self.get_path_id(self.path_info, create=True) self.cache_root_content() @cached_property def drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive + import logging + + if os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT"): + with open("credentials.json", "w") as credentials_file: + credentials_file.write( + os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT") + ) + + 
logging.getLogger("googleapiclient.discovery_cache").setLevel( + logging.ERROR + ) GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - gauth = GoogleAuth(settings_file=self.GOOGLE_AUTH_SETTINGS_PATH) + gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) gauth.CommandLineAuth() return GoogleDrive(gauth) @@ -79,7 +98,7 @@ def create_file_id(self, file_id, parent_id, part, create): { "title": part, "parents": [{"id": parent_id}], - "mimeType": "application/vnd.google-apps.folder", + "mimeType": self.FOLDER_MIME_TYPE, } ) gdrive_file.Upload() @@ -111,6 +130,9 @@ def get_path_id(self, path_info, create=False): else: parent_id = path_info.netloc + if not parts and file_id: + return file_id + return self.resolve_file_id(file_id, parent_id, parts, create) def exists(self, path_info): @@ -144,20 +166,36 @@ def _upload(self, from_file, to_info, name, no_progress_bar): file1.Upload() from_file.close() - def _download( - self, from_info, to_file, _unused_name, _unused_no_progress_bar - ): + def _download(self, from_info, to_file, name, no_progress_bar): + from dvc.progress import Tqdm + file_id = self.get_path_id(from_info) gdrive_file = self.drive.CreateFile({"id": file_id}) + if not no_progress_bar: + tqdm = Tqdm(desc=name, total=int(gdrive_file["fileSize"])) gdrive_file.GetContentFile(to_file) - # if not no_progress_bar: - # progress.update_target(name, 1, 1) + if not no_progress_bar: + tqdm.close() def get_file_checksum(self, path_info): raise NotImplementedError def list_cache_paths(self): - raise NotImplementedError + file_id = self.get_path_id(self.path_info) + prefix = self.path_info.path + for path in self.list_path(file_id): + yield prefix + "/" + path def walk(self, path_info): raise NotImplementedError + + def list_path(self, parent_id): + file_list = self.drive.ListFile( + {"q": "'%s' in parents and trashed=false" % parent_id} + ).GetList() + for file1 in file_list: + if file1["mimeType"] == self.FOLDER_MIME_TYPE: + for i in self.list_path(file1["id"]): + yield file1["title"] + "/" + i + else: + yield file1["title"] diff --git a/dvc/remote/gdrive/settings.yaml b/dvc/remote/gdrive/settings.yaml index 27731eb038..f7d5779666 100644 --- a/dvc/remote/gdrive/settings.yaml +++ b/dvc/remote/gdrive/settings.yaml @@ -1,7 +1,7 @@ client_config_backend: settings client_config: - client_id: 470227652556-1n09ue25mtb7gp66i4lvks3jompjisen.apps.googleusercontent.com - client_secret: 0ipFt0Dn4V_Tge6kw7aiu0GR + client_id: 719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com + client_secret: 2fy_HyzSwkxkGzEken7hThXb save_credentials: True save_credentials_backend: file @@ -12,4 +12,3 @@ get_refresh_token: True oauth_scope: - https://www.googleapis.com/auth/drive - https://www.googleapis.com/auth/drive.appdata - \ No newline at end of file diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 72ee9fdff4..00314d6724 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -73,6 +73,8 @@ def _should_test_aws(): def _should_test_gdrive(): if os.getenv("DVC_TEST_GDRIVE") == "true": return True + if os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT"): + return True return False diff --git a/tests/unit/remote/gdrive/test_gdrive.py b/tests/unit/remote/gdrive/test_gdrive.py index ec1e2a3446..6b35123149 100644 --- a/tests/unit/remote/gdrive/test_gdrive.py +++ b/tests/unit/remote/gdrive/test_gdrive.py @@ -1,12 +1,15 @@ +import mock from dvc.remote.gdrive import RemoteGDrive 
+@mock.patch("dvc.remote.gdrive.RemoteGDrive.drive") def test_init_drive(repo): url = "gdrive://root/data" gdrive = RemoteGDrive(repo, {"url": url}) assert str(gdrive.path_info) == url +@mock.patch("dvc.remote.gdrive.RemoteGDrive.drive") def test_init_folder_id(repo): url = "gdrive://folder_id/data" gdrive = RemoteGDrive(repo, {"url": url}) From 488ffa3bbfe5d24acd535db49e8e332c12670b20 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Fri, 11 Oct 2019 07:43:05 -0700 Subject: [PATCH 09/33] Adjust test_data_cloud gdrive run condition --- tests/func/test_data_cloud.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 00314d6724..6a48bd7fd0 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -71,10 +71,11 @@ def _should_test_aws(): def _should_test_gdrive(): - if os.getenv("DVC_TEST_GDRIVE") == "true": - return True - if os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT"): + if os.getenv("DVC_TEST_GDRIVE") == "true" and os.getenv( + "PYDRIVE_USER_CREDENTIALS_FILE_CONTENT" + ): return True + return False From 525047537d37435306e8dce74c950d198588f84d Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Fri, 11 Oct 2019 10:41:33 -0700 Subject: [PATCH 10/33] Move remote access out from init --- dvc/remote/gdrive/__init__.py | 69 +++++++++++++++++------------------ tests/func/test_data_cloud.py | 2 +- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index fb1a1e8ae6..c1a7f76ac9 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -37,47 +37,46 @@ def __init__(self, repo, config): self.DEFAULT_GOOGLE_AUTH_SETTINGS_PATH, ) self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) - self.root_content_cached = False - self.root_dirs_list = {} - self.get_path_id(self.path_info, create=True) - self.cache_root_content() + self._drive = None @cached_property + def cached_root_dirs(self): + cached_dirs = {} + for dirs_list in self.drive.ListFile( + { + "q": "'%s' in parents and trashed=false" + % self.path_info.netloc, + "maxResults": 256, + } + ): + for dir1 in dirs_list: + cached_dirs[dir1["title"]] = dir1["id"] + return cached_dirs + + @property def drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive import logging - if os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT"): - with open("credentials.json", "w") as credentials_file: - credentials_file.write( - os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT") - ) + if self._drive is None: + if os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT"): + with open("credentials.json", "w") as credentials_file: + credentials_file.write( + os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT") + ) - logging.getLogger("googleapiclient.discovery_cache").setLevel( - logging.ERROR - ) + logging.getLogger("googleapiclient.discovery_cache").setLevel( + logging.ERROR + ) + + GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" + gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) + gauth.CommandLineAuth() + self._drive = GoogleDrive(gauth) - GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) - gauth.CommandLineAuth() - return GoogleDrive(gauth) - - def cache_dirs(self, dirs_list): - for dir1 in dirs_list: - self.root_dirs_list[dir1["title"]] = dir1["id"] - - def cache_root_content(self): - if not 
self.root_content_cached: - for dirs_list in self.drive.ListFile( - { - "q": "'%s' in parents and trashed=false" - % self.path_info.netloc, - "maxResults": 256, - } - ): - self.cache_dirs(dirs_list) - self.root_content_cached = True + self.get_path_id(self.path_info, create=True) + return self._drive def resolve_file_id_from_part(self, part, parent_id, file_list): file_id = "" @@ -123,9 +122,9 @@ def get_path_id(self, path_info, create=False): file_id = "" parts = path_info.path.split("/") - if parts and (parts[0] in self.root_dirs_list): - parent_id = self.root_dirs_list[parts[0]] - file_id = self.root_dirs_list[parts[0]] + if parts and (parts[0] in self.cached_root_dirs): + parent_id = self.cached_root_dirs[parts[0]] + file_id = self.cached_root_dirs[parts[0]] parts.pop(0) else: parent_id = path_info.netloc diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 6a48bd7fd0..7ac0210867 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -71,7 +71,7 @@ def _should_test_aws(): def _should_test_gdrive(): - if os.getenv("DVC_TEST_GDRIVE") == "true" and os.getenv( + if os.getenv("DVC_TEST_GDRIVE") == "true" or os.getenv( "PYDRIVE_USER_CREDENTIALS_FILE_CONTENT" ): return True From df8e9f7dc9d5a1e33bb47d4544adfb73f4c12d49 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Fri, 11 Oct 2019 12:05:48 -0700 Subject: [PATCH 11/33] Incorporate ratelimit decorator to call GDrive API --- dvc/remote/gdrive/__init__.py | 9 ++++++++- setup.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index c1a7f76ac9..e095675343 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -3,6 +3,7 @@ import os from funcy import cached_property +import ratelimit from dvc.scheme import Schemes from dvc.path_info import CloudURLInfo @@ -54,7 +55,7 @@ def cached_root_dirs(self): return cached_dirs @property - def drive(self): + def raw_drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive import logging @@ -78,6 +79,12 @@ def drive(self): self.get_path_id(self.path_info, create=True) return self._drive + @property + @ratelimit.sleep_and_retry + @ratelimit.limits(calls=8, period=10) + def drive(self): + return self.raw_drive + def resolve_file_id_from_part(self, part, parent_id, file_list): file_id = "" for file1 in file_list: diff --git a/setup.py b/setup.py index e1e79920cf..45f4d837d5 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ def run(self): # Extra dependencies for remote integrations gs = ["google-cloud-storage==1.19.0"] -gdrive = ["pydrive==1.3.1"] +gdrive = ["pydrive==1.3.1", "ratelimit==2.2.1"] s3 = ["boto3==1.9.115"] azure = ["azure-storage-blob==2.1.0"] oss = ["oss2==2.6.1"] From dec8c2845992eb9097aac6a91ddb8faeee21f840 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Fri, 11 Oct 2019 12:18:30 -0700 Subject: [PATCH 12/33] Missed setup dep --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 45f4d837d5..c8e3983d01 100644 --- a/setup.py +++ b/setup.py @@ -120,6 +120,7 @@ def run(self): "awscli>=1.16.125", "google-compute-engine==2.8.13", "pydrive>=1.3.1", + "ratelimit==2.2.1", "pywin32; sys_platform == 'win32'", "Pygments", # required by collective.checkdocs, "collective.checkdocs", From 43078ebfe38f9089e840865f55fb605d26cfe337 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Fri, 11 Oct 2019 13:33:10 -0700 Subject: [PATCH 13/33] Refactor drive ListFile query --- 
dvc/remote/gdrive/__init__.py | 25 ++++++++++++------------- setup.py | 2 -- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index e095675343..3a38eb9c47 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -22,7 +22,7 @@ class RemoteGDrive(RemoteBASE): scheme = Schemes.GDRIVE path_cls = GDriveURLInfo REGEX = r"^gdrive://.*$" - REQUIRES = {"pydrive": "pydrive"} + REQUIRES = {"pydrive": "pydrive", "ratelimit": "ratelimit"} PARAM_CHECKSUM = "md5Checksum" DEFAULT_GOOGLE_AUTH_SETTINGS_PATH = os.path.join( os.path.dirname(__file__), "settings.yaml" @@ -40,18 +40,18 @@ def __init__(self, repo, config): self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) self._drive = None + def list_drive_item(self, query): + for page in self.drive.ListFile({"q": query, "maxResults": 1000}): + for item in page: + yield item + @cached_property def cached_root_dirs(self): cached_dirs = {} - for dirs_list in self.drive.ListFile( - { - "q": "'%s' in parents and trashed=false" - % self.path_info.netloc, - "maxResults": 256, - } + for dir1 in self.list_drive_item( + "'%s' in parents and trashed=false" % self.path_info.netloc ): - for dir1 in dirs_list: - cached_dirs[dir1["title"]] = dir1["id"] + cached_dirs[dir1["title"]] = dir1["id"] return cached_dirs @property @@ -196,10 +196,9 @@ def walk(self, path_info): raise NotImplementedError def list_path(self, parent_id): - file_list = self.drive.ListFile( - {"q": "'%s' in parents and trashed=false" % parent_id} - ).GetList() - for file1 in file_list: + for file1 in self.list_drive_item( + "'%s' in parents and trashed=false" % parent_id + ): if file1["mimeType"] == self.FOLDER_MIME_TYPE: for i in self.list_path(file1["id"]): yield file1["title"] + "/" + i diff --git a/setup.py b/setup.py index c8e3983d01..83cf933edc 100644 --- a/setup.py +++ b/setup.py @@ -119,8 +119,6 @@ def run(self): "xmltodict>=0.11.0", "awscli>=1.16.125", "google-compute-engine==2.8.13", - "pydrive>=1.3.1", - "ratelimit==2.2.1", "pywin32; sys_platform == 'win32'", "Pygments", # required by collective.checkdocs, "collective.checkdocs", From 456cfe1f2b8673914cf48cdd0373affbac9ba4cf Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Fri, 11 Oct 2019 14:17:27 -0700 Subject: [PATCH 14/33] Fix deps --- dvc/remote/gdrive/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 3a38eb9c47..e97ad5500e 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -22,7 +22,7 @@ class RemoteGDrive(RemoteBASE): scheme = Schemes.GDRIVE path_cls = GDriveURLInfo REGEX = r"^gdrive://.*$" - REQUIRES = {"pydrive": "pydrive", "ratelimit": "ratelimit"} + REQUIRES = {"pydrive": "pydrive"} PARAM_CHECKSUM = "md5Checksum" DEFAULT_GOOGLE_AUTH_SETTINGS_PATH = os.path.join( os.path.dirname(__file__), "settings.yaml" diff --git a/setup.py b/setup.py index 83cf933edc..d1d0cf3d15 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ def run(self): # we can start shipping it by default. 
ssh_gssapi = ["paramiko[gssapi]>=2.5.0"] hdfs = ["pyarrow==0.14.0"] -all_remotes = gs + s3 + azure + ssh + oss +all_remotes = gs + s3 + azure + ssh + oss + gdrive if os.name != "nt" or sys.version_info[0] != 2: # NOTE: there are no pyarrow wheels for python2 on windows From 0357a5ac52f53002b491a43c354b3f16c22df68e Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 12 Oct 2019 04:47:53 -0700 Subject: [PATCH 15/33] Refactor get_path_id --- dvc/remote/gdrive/__init__.py | 75 ++++++++++++++++------------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index e97ad5500e..8d4bb81d63 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -67,6 +67,7 @@ def raw_drive(self): os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT") ) + # Supress import error on GoogleAuth warning logging.getLogger("googleapiclient.discovery_cache").setLevel( logging.ERROR ) @@ -85,48 +86,42 @@ def raw_drive(self): def drive(self): return self.raw_drive - def resolve_file_id_from_part(self, part, parent_id, file_list): - file_id = "" - for file1 in file_list: - if file1["title"] == part: - file_id = file1["id"] - file_list = self.drive.ListFile( - {"q": "'%s' in parents and trashed=false" % file_id} - ).GetList() - parent_id = file1["id"] - break - return file_id, parent_id, file_list - - def create_file_id(self, file_id, parent_id, part, create): - if file_id == "": - if create: - gdrive_file = self.drive.CreateFile( + def create_drive_item(self, parent_id, title): + item = self.drive.CreateFile( + { + "title": title, + "parents": [{"id": parent_id}], + "mimeType": self.FOLDER_MIME_TYPE, + } + ) + item.Upload() + return item + + def get_drive_item(self, name, parent_id): + return next( + iter( + self.drive.ListFile( { - "title": part, - "parents": [{"id": parent_id}], - "mimeType": self.FOLDER_MIME_TYPE, + "q": "'%s' in parents and trashed=false and title='%s'" + % (parent_id, name) } - ) - gdrive_file.Upload() - file_id = gdrive_file["id"] - return file_id - - def resolve_file_id(self, file_id, parent_id, path_parts, create): - file_list = self.drive.ListFile( - {"q": "'%s' in parents and trashed=false" % parent_id} - ).GetList() - - for part in path_parts: - file_id, parent_id, file_list = self.resolve_file_id_from_part( - part, parent_id, file_list - ) - file_id = self.create_file_id(file_id, parent_id, part, create) - if file_id == "": - break - return file_id + ).GetList() + ), + None, + ) + + def resolve_remote_file(self, parent_id, path_parts, create): + for path_part in path_parts: + item = self.get_drive_item(path_part, parent_id) + if not item: + if create: + item = self.create_drive_item(parent_id, path_part) + else: + break + parent_id = item["id"] + return item def get_path_id(self, path_info, create=False): - file_id = "" parts = path_info.path.split("/") if parts and (parts[0] in self.cached_root_dirs): @@ -139,7 +134,8 @@ def get_path_id(self, path_info, create=False): if not parts and file_id: return file_id - return self.resolve_file_id(file_id, parent_id, parts, create) + file1 = self.resolve_remote_file(parent_id, parts, create) + return file1["id"] if file1 else "" def exists(self, path_info): return self.get_path_id(path_info) != "" @@ -152,7 +148,6 @@ def batch_exists(self, path_infos, callback): return results def _upload(self, from_file, to_info, name, no_progress_bar): - dirname = to_info.parent if dirname: parent_id = self.get_path_id(dirname, True) From 
ae6ab5fceb22dda86f1d228adebc30c763fa2395 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 12 Oct 2019 05:47:36 -0700 Subject: [PATCH 16/33] Fix climate issues --- dvc/remote/gdrive/__init__.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 8d4bb81d63..6fc8acdfea 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -113,23 +113,26 @@ def get_drive_item(self, name, parent_id): def resolve_remote_file(self, parent_id, path_parts, create): for path_part in path_parts: item = self.get_drive_item(path_part, parent_id) - if not item: - if create: - item = self.create_drive_item(parent_id, path_part) - else: - break + if not item and create: + item = self.create_drive_item(parent_id, path_part) + elif not item: + return None parent_id = item["id"] return item - def get_path_id(self, path_info, create=False): + def get_path_id_from_cache(self, path_info): + file_id = "" parts = path_info.path.split("/") - if parts and (parts[0] in self.cached_root_dirs): parent_id = self.cached_root_dirs[parts[0]] file_id = self.cached_root_dirs[parts[0]] parts.pop(0) else: parent_id = path_info.netloc + return file_id, parent_id, parts + + def get_path_id(self, path_info, create=False): + file_id, parent_id, parts = self.get_path_id_from_cache(path_info) if not parts and file_id: return file_id @@ -190,12 +193,16 @@ def list_cache_paths(self): def walk(self, path_info): raise NotImplementedError + def list_file_path(self, drive_file): + if drive_file["mimeType"] == self.FOLDER_MIME_TYPE: + for i in self.list_path(drive_file["id"]): + yield drive_file["title"] + "/" + i + else: + yield drive_file["title"] + def list_path(self, parent_id): for file1 in self.list_drive_item( "'%s' in parents and trashed=false" % parent_id ): - if file1["mimeType"] == self.FOLDER_MIME_TYPE: - for i in self.list_path(file1["id"]): - yield file1["title"] + "/" + i - else: - yield file1["title"] + for path in self.list_file_path(file1): + yield path From 91b4da96460b7180468d358f74bb2601ef6f8286 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sun, 13 Oct 2019 17:15:17 -0700 Subject: [PATCH 17/33] Create PyDrive instance on init of Remote --- dvc/remote/gdrive/__init__.py | 39 +++++++++++++++++------------------ tests/func/test_data_cloud.py | 5 +---- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 6fc8acdfea..a2279b6f32 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -38,7 +38,10 @@ def __init__(self, repo, config): self.DEFAULT_GOOGLE_AUTH_SETTINGS_PATH, ) self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) - self._drive = None + self.init_drive() + + def init_drive(self): + self.get_path_id(self.path_info, create=True) def list_drive_item(self, query): for page in self.drive.ListFile({"q": query, "maxResults": 1000}): @@ -54,35 +57,31 @@ def cached_root_dirs(self): cached_dirs[dir1["title"]] = dir1["id"] return cached_dirs - @property + @cached_property def raw_drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive import logging - if self._drive is None: - if os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT"): - with open("credentials.json", "w") as credentials_file: - credentials_file.write( - os.getenv("PYDRIVE_USER_CREDENTIALS_FILE_CONTENT") - ) + if os.getenv("PYDRIVE_USER_CREDENTIALS_DATA"): + with 
open("credentials.json", "w") as credentials_file: + credentials_file.write( + os.getenv("PYDRIVE_USER_CREDENTIALS_DATA") + ) - # Supress import error on GoogleAuth warning - logging.getLogger("googleapiclient.discovery_cache").setLevel( - logging.ERROR - ) - - GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) - gauth.CommandLineAuth() - self._drive = GoogleDrive(gauth) + # Supress import error on GoogleAuth warning + logging.getLogger("googleapiclient.discovery_cache").setLevel( + logging.ERROR + ) - self.get_path_id(self.path_info, create=True) - return self._drive + GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" + gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) + gauth.CommandLineAuth() + return GoogleDrive(gauth) @property @ratelimit.sleep_and_retry - @ratelimit.limits(calls=8, period=10) + @ratelimit.limits(calls=10, period=10) def drive(self): return self.raw_drive diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 7ac0210867..9524ebcaa0 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -71,9 +71,7 @@ def _should_test_aws(): def _should_test_gdrive(): - if os.getenv("DVC_TEST_GDRIVE") == "true" or os.getenv( - "PYDRIVE_USER_CREDENTIALS_FILE_CONTENT" - ): + if os.getenv("PYDRIVE_USER_CREDENTIALS_DATA"): return True return False @@ -248,7 +246,6 @@ def test(self): clist = [ ("s3://mybucket/", RemoteS3), - ("gdrive://root/", RemoteGDrive), ("gs://mybucket/", RemoteGS), ("ssh://user@localhost:/", RemoteSSH), ("http://localhost:8000/", RemoteHTTP), From b6f87e7bb4d7c6a4d793af057907a7b1277014a6 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Mon, 14 Oct 2019 02:01:20 -0700 Subject: [PATCH 18/33] Increase API rate limit to its maximum - 10 calls per second --- dvc/remote/gdrive/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index a2279b6f32..9c098fd130 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -81,7 +81,7 @@ def raw_drive(self): @property @ratelimit.sleep_and_retry - @ratelimit.limits(calls=10, period=10) + @ratelimit.limits(calls=10, period=1) def drive(self): return self.raw_drive From 0b08ab3226609f03038e06b5f73bcc54433a62b9 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Tue, 15 Oct 2019 16:53:43 -0700 Subject: [PATCH 19/33] Fix code review findings --- dvc/remote/gdrive/__init__.py | 57 ++++++++++------------------ tests/conftest.py | 9 ----- tests/unit/remote/gdrive/conftest.py | 8 +--- 3 files changed, 22 insertions(+), 52 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 9c098fd130..1e2bdab61d 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import os +import posixpath from funcy import cached_property import ratelimit @@ -52,7 +53,7 @@ def list_drive_item(self, query): def cached_root_dirs(self): cached_dirs = {} for dir1 in self.list_drive_item( - "'%s' in parents and trashed=false" % self.path_info.netloc + "'{}' in parents and trashed=false".format(self.path_info.netloc) ): cached_dirs[dir1["title"]] = dir1["id"] return cached_dirs @@ -77,11 +78,12 @@ def raw_drive(self): GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) gauth.CommandLineAuth() - return 
GoogleDrive(gauth) + gdrive = GoogleDrive(gauth) + return gdrive @property @ratelimit.sleep_and_retry - @ratelimit.limits(calls=10, period=1) + @ratelimit.limits(calls=8, period=1.2) def drive(self): return self.raw_drive @@ -97,17 +99,14 @@ def create_drive_item(self, parent_id, title): return item def get_drive_item(self, name, parent_id): - return next( - iter( - self.drive.ListFile( - { - "q": "'%s' in parents and trashed=false and title='%s'" - % (parent_id, name) - } - ).GetList() - ), - None, - ) + item_list = self.drive.ListFile( + { + "q": "'{}' in parents and trashed=false and title='{}'".format( + parent_id, name + ) + } + ).GetList() + return next(iter(item_list), None) def resolve_remote_file(self, parent_id, path_parts, create): for path_part in path_parts: @@ -142,13 +141,6 @@ def get_path_id(self, path_info, create=False): def exists(self, path_info): return self.get_path_id(path_info) != "" - def batch_exists(self, path_infos, callback): - results = [] - for path_info in path_infos: - results.append(self.exists(path_info)) - callback.update(str(path_info)) - return results - def _upload(self, from_file, to_info, name, no_progress_bar): dirname = to_info.parent if dirname: @@ -160,14 +152,13 @@ def _upload(self, from_file, to_info, name, no_progress_bar): {"title": to_info.name, "parents": [{"id": parent_id}]} ) - from_file = open(from_file, "rb") - if not no_progress_bar: - from_file = TrackFileReadProgress(name, from_file) + with open(from_file, "rb") as from_file: + if not no_progress_bar: + from_file = TrackFileReadProgress(name, from_file) - file1.content = from_file + file1.content = from_file - file1.Upload() - from_file.close() + file1.Upload() def _download(self, from_info, to_file, name, no_progress_bar): from dvc.progress import Tqdm @@ -180,28 +171,22 @@ def _download(self, from_info, to_file, name, no_progress_bar): if not no_progress_bar: tqdm.close() - def get_file_checksum(self, path_info): - raise NotImplementedError - def list_cache_paths(self): file_id = self.get_path_id(self.path_info) prefix = self.path_info.path for path in self.list_path(file_id): - yield prefix + "/" + path - - def walk(self, path_info): - raise NotImplementedError + yield posixpath.join(prefix, path) def list_file_path(self, drive_file): if drive_file["mimeType"] == self.FOLDER_MIME_TYPE: for i in self.list_path(drive_file["id"]): - yield drive_file["title"] + "/" + i + yield posixpath.join(drive_file["title"], i) else: yield drive_file["title"] def list_path(self, parent_id): for file1 in self.list_drive_item( - "'%s' in parents and trashed=false" % parent_id + "'{}' in parents and trashed=false".format(parent_id) ): for path in self.list_file_path(file1): yield path diff --git a/tests/conftest.py b/tests/conftest.py index f1f8c90ce5..9ca00335fb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,6 @@ from git.exc import GitCommandNotFound from dvc.remote.config import RemoteConfig -from dvc.remote.gdrive import RemoteGDrive from dvc.utils.compat import cast_bytes_py2 from dvc.remote.ssh.connection import SSHConnection from dvc.repo import Repo as DvcRepo @@ -18,14 +17,6 @@ os.environ[cast_bytes_py2("DVC_IGNORE_ISATTY")] = cast_bytes_py2("true") -# Make DVC tests use separate OAuth token to access Google Drive -def skip_pydrive_init(_): - pass - - -RemoteGDrive.init_gdrive = skip_pydrive_init - - @pytest.fixture(autouse=True) def reset_loglevel(request, caplog): """ diff --git a/tests/unit/remote/gdrive/conftest.py b/tests/unit/remote/gdrive/conftest.py index 
4f9c8a3d21..035ca15094 100644 --- a/tests/unit/remote/gdrive/conftest.py +++ b/tests/unit/remote/gdrive/conftest.py @@ -1,15 +1,9 @@ import pytest -from dvc.repo import Repo from dvc.remote.gdrive import RemoteGDrive -@pytest.fixture() -def repo(): - return Repo(".") - - @pytest.fixture def gdrive(repo): - ret = RemoteGDrive(repo, {"url": "gdrive://root/data"}) + ret = RemoteGDrive(None, {"url": "gdrive://root/data"}) return ret From 934be73927d1ac1f369d351bfdde1fc262e8cf48 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sun, 20 Oct 2019 10:03:11 -0700 Subject: [PATCH 20/33] Wrap GDrive API calls into single function with backoff on exception --- dvc/remote/gdrive/__init__.py | 207 ++++++++++++++++++++++++++-------- 1 file changed, 161 insertions(+), 46 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 1e2bdab61d..97806a8eb8 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -4,14 +4,15 @@ import posixpath from funcy import cached_property -import ratelimit +from ratelimit import limits, sleep_and_retry +from backoff import on_exception, expo from dvc.scheme import Schemes from dvc.path_info import CloudURLInfo from dvc.remote.base import RemoteBASE from dvc.config import Config from dvc.remote.gdrive.utils import TrackFileReadProgress, shared_token_warning - +from dvc.exceptions import DvcException class GDriveURLInfo(CloudURLInfo): @property @@ -19,6 +20,87 @@ def netloc(self): return self.parsed.netloc +class RequestBASE: + def __init__(self, drive): + self.drive = drive + + def execute(self): + raise NotImplementedError + + +class RequestListFile(RequestBASE): + def __init__(self, drive, query): + super(RequestListFile, self).__init__(drive) + self.query = query + + def execute(self): + return self.drive.ListFile({"q": self.query, "maxResults": 1000}).GetList() + + +class RequestUploadFile(RequestBASE): + def __init__( + self, + drive, + title, + parent_id, + mime_type, + no_progress_bar=True, + from_file="", + progress_name="", + ): + super(RequestUploadFile, self).__init__(drive) + self.title = title + self.parent_id = parent_id + self.mime_type = mime_type + self.no_progress_bar = no_progress_bar + self.from_file = from_file + self.proress_name = progress_name + + def execute(self): + item = self.drive.CreateFile( + { + "title": self.title, + "parents": [{"id": self.parent_id}], + "mimeType": self.mime_type, + } + ) + if self.mime_type == RemoteGDrive.FOLDER_MIME_TYPE: + item.Upload() + else: + with open(self.from_file, "rb") as from_file: + if not self.no_progress_bar: + from_file = TrackFileReadProgress( + self.proress_name, from_file + ) + if os.stat(self.from_file).st_size: + item.content = from_file + item.Upload() + return item + + +class RequestDownloadFile(RequestBASE): + def __init__( + self, drive, file_id, to_file, progress_name, no_progress_bar=True + ): + super(RequestDownloadFile, self).__init__(drive) + self.file_id = file_id + self.to_file = to_file + self.progress_name = progress_name + self.no_progress_bar = no_progress_bar + + def execute(self): + from dvc.progress import Tqdm + + gdrive_file = self.drive.CreateFile({"id": self.file_id}) + if not self.no_progress_bar: + tqdm = Tqdm( + desc=self.progress_name, total=int(gdrive_file["fileSize"]) + ) + gdrive_file.GetContentFile(self.to_file) + if not self.no_progress_bar: + tqdm.close() + + class RemoteGDrive(RemoteBASE): scheme = Schemes.GDRIVE path_cls = GDriveURLInfo @@ -32,6 +114,7 @@ class RemoteGDrive(RemoteBASE): def __init__(self, 
repo, config): super(RemoteGDrive, self).__init__(repo, config) + self.no_traverse = False if Config.SECTION_GDRIVE_CREDENTIALPATH not in config: shared_token_warning() self.gdrive_credentials_path = config.get( @@ -42,21 +125,38 @@ def __init__(self, repo, config): self.init_drive() def init_drive(self): - self.get_path_id(self.path_info, create=True) + self.root_id = self.get_path_id(self.path_info, create=True) + + @on_exception(expo, DvcException, max_tries=8) + @sleep_and_retry + @limits(calls=10, period=1) + def execute_request(self, request): + try: + result = request.execute() + except Exception as exception: + if ('Rate Limit Exceeded' in str(exception)): + raise DvcException("API usage rate limit exceeded") + raise + return result def list_drive_item(self, query): - for page in self.drive.ListFile({"q": query, "maxResults": 1000}): - for item in page: - yield item + list_request = RequestListFile(self.drive, query) + for item in self.execute_request(list_request): + yield item + #for page in self.execute_request(list_request): + # for item in page: + # yield item @cached_property def cached_root_dirs(self): - cached_dirs = {} + self.cached_dirs = {} + self.cached_dir_id = {} for dir1 in self.list_drive_item( - "'{}' in parents and trashed=false".format(self.path_info.netloc) + "'{}' in parents and trashed=false".format(self.root_id) ): - cached_dirs[dir1["title"]] = dir1["id"] - return cached_dirs + self.cached_dirs[dir1["title"]] = dir1["id"] + self.cached_dir_id[dir1["id"]] = dir1["title"] + return self.cached_dirs @cached_property def raw_drive(self): @@ -82,30 +182,24 @@ def raw_drive(self): return gdrive @property - @ratelimit.sleep_and_retry - @ratelimit.limits(calls=8, period=1.2) def drive(self): return self.raw_drive def create_drive_item(self, parent_id, title): - item = self.drive.CreateFile( - { - "title": title, - "parents": [{"id": parent_id}], - "mimeType": self.FOLDER_MIME_TYPE, - } + upload_request = RequestUploadFile( + self.drive, title, parent_id, self.FOLDER_MIME_TYPE ) - item.Upload() - return item + result = self.execute_request(upload_request) + return result def get_drive_item(self, name, parent_id): - item_list = self.drive.ListFile( - { - "q": "'{}' in parents and trashed=false and title='{}'".format( - parent_id, name - ) - } - ).GetList() + list_request = RequestListFile( + self.drive, + "'{}' in parents and trashed=false and title='{}'".format( + parent_id, name + ), + ) + item_list = self.execute_request(list_request) return next(iter(item_list), None) def resolve_remote_file(self, parent_id, path_parts, create): @@ -121,7 +215,11 @@ def resolve_remote_file(self, parent_id, path_parts, create): def get_path_id_from_cache(self, path_info): file_id = "" parts = path_info.path.split("/") - if parts and (parts[0] in self.cached_root_dirs): + if ( + path_info != self.path_info + and parts + and (parts[0] in self.cached_root_dirs) + ): parent_id = self.cached_root_dirs[parts[0]] file_id = self.cached_root_dirs[parts[0]] parts.pop(0) @@ -148,28 +246,23 @@ def _upload(self, from_file, to_info, name, no_progress_bar): else: parent_id = to_info.netloc - file1 = self.drive.CreateFile( - {"title": to_info.name, "parents": [{"id": parent_id}]} + upload_request = RequestUploadFile( + self.drive, + to_info.name, + parent_id, + "", + no_progress_bar, + from_file, + name, ) - - with open(from_file, "rb") as from_file: - if not no_progress_bar: - from_file = TrackFileReadProgress(name, from_file) - - file1.content = from_file - - file1.Upload() + 
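# Condensed, illustrative sketch of the retry/throttle pattern execute_request
# applies above (the standalone function name call_gdrive is an assumption,
# not part of the patch): ratelimit.limits raises once the call budget is
# spent, sleep_and_retry turns that into a pause, and backoff.on_exception
# retries with exponentially growing delays when a DvcException is raised.
from backoff import expo, on_exception
from ratelimit import limits, sleep_and_retry

from dvc.exceptions import DvcException


@on_exception(expo, DvcException, max_tries=8)
@sleep_and_retry
@limits(calls=10, period=1)
def call_gdrive(request):
    try:
        return request.execute()
    except Exception as exc:
        if "Rate Limit Exceeded" in str(exc):
            raise DvcException("API usage rate limit exceeded")
        raise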
self.execute_request(upload_request) def _download(self, from_info, to_file, name, no_progress_bar): - from dvc.progress import Tqdm - file_id = self.get_path_id(from_info) - gdrive_file = self.drive.CreateFile({"id": file_id}) - if not no_progress_bar: - tqdm = Tqdm(desc=name, total=int(gdrive_file["fileSize"])) - gdrive_file.GetContentFile(to_file) - if not no_progress_bar: - tqdm.close() + download_request = RequestDownloadFile( + self.drive, file_id, to_file, name, no_progress_bar + ) + self.execute_request(download_request) def list_cache_paths(self): file_id = self.get_path_id(self.path_info) @@ -190,3 +283,25 @@ def list_path(self, parent_id): ): for path in self.list_file_path(file1): yield path + + def all(self): + query = " or ".join( + "'{}' in parents".format(dir_id) + for dir_title, dir_id in self.cached_root_dirs.items() + ) + if not query: + return + query += " and trashed=false" + print("All query: {}".format(query)) + for file1 in self.list_drive_item(query): + parent_id = file1["parents"][0]["id"] + print(self.cached_dir_id[parent_id]) + print(file1["title"]) + path = posixpath.join( + self.cached_dir_id[parent_id], file1["title"] + ) + try: + yield self.path_to_checksum(path) + except ValueError: + # We ignore all the non-cache looking files + pass From 03951fc9e0d500adcf9b67ae518b9f7f3db9d38d Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Wed, 23 Oct 2019 11:17:29 -0700 Subject: [PATCH 21/33] Support pagination of GDrive API response; Fix code climate issues --- dvc/remote/gdrive/__init__.py | 154 ++++++++++------------------------ dvc/remote/gdrive/pydrive.py | 93 ++++++++++++++++++++ 2 files changed, 138 insertions(+), 109 deletions(-) create mode 100644 dvc/remote/gdrive/pydrive.py diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 97806a8eb8..8e43995b44 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -11,8 +11,15 @@ from dvc.path_info import CloudURLInfo from dvc.remote.base import RemoteBASE from dvc.config import Config -from dvc.remote.gdrive.utils import TrackFileReadProgress, shared_token_warning +from dvc.remote.gdrive.utils import shared_token_warning from dvc.exceptions import DvcException +from dvc.remote.gdrive.pydrive import ( + RequestListFile, + RequestListFilePaginated, + RequestUploadFile, + RequestDownloadFile, +) + class GDriveURLInfo(CloudURLInfo): @property @@ -20,87 +27,6 @@ def netloc(self): return self.parsed.netloc -class RequestBASE: - def __init__(self, drive): - self.drive = drive - - def execute(self): - raise NotImplementedError - - -class RequestListFile(RequestBASE): - def __init__(self, drive, query): - super(RequestListFile, self).__init__(drive) - self.query = query - - def execute(self): - return self.drive.ListFile({"q": self.query, "maxResults": 1000}).GetList() - - -class RequestUploadFile(RequestBASE): - def __init__( - self, - drive, - title, - parent_id, - mime_type, - no_progress_bar=True, - from_file="", - progress_name="", - ): - super(RequestUploadFile, self).__init__(drive) - self.title = title - self.parent_id = parent_id - self.mime_type = mime_type - self.no_progress_bar = no_progress_bar - self.from_file = from_file - self.proress_name = progress_name - - def execute(self): - item = self.drive.CreateFile( - { - "title": self.title, - "parents": [{"id": self.parent_id}], - "mimeType": self.mime_type, - } - ) - if self.mime_type == RemoteGDrive.FOLDER_MIME_TYPE: - item.Upload() - else: - with open(self.from_file, "rb") as from_file: - if not 
self.no_progress_bar: - from_file = TrackFileReadProgress( - self.proress_name, from_file - ) - if os.stat(self.from_file).st_size: - item.content = from_file - item.Upload() - return item - - -class RequestDownloadFile(RequestBASE): - def __init__( - self, drive, file_id, to_file, progress_name, no_progress_bar=True - ): - super(RequestDownloadFile, self).__init__(drive) - self.file_id = file_id - self.to_file = to_file - self.progress_name = progress_name - self.no_progress_bar = no_progress_bar - - def execute(self): - from dvc.progress import Tqdm - - gdrive_file = self.drive.CreateFile({"id": self.file_id}) - if not self.no_progress_bar: - tqdm = Tqdm( - desc=self.progress_name, total=int(gdrive_file["fileSize"]) - ) - gdrive_file.GetContentFile(self.to_file) - if not self.no_progress_bar: - tqdm.close() - - class RemoteGDrive(RemoteBASE): scheme = Schemes.GDRIVE path_cls = GDriveURLInfo @@ -134,32 +60,32 @@ def execute_request(self, request): try: result = request.execute() except Exception as exception: - if ('Rate Limit Exceeded' in str(exception)): + if "Rate Limit Exceeded" in str(exception): raise DvcException("API usage rate limit exceeded") raise return result def list_drive_item(self, query): - list_request = RequestListFile(self.drive, query) - for item in self.execute_request(list_request): - yield item - #for page in self.execute_request(list_request): - # for item in page: - # yield item + list_request = RequestListFilePaginated(self.drive, query) + page_list = self.execute_request(list_request) + while page_list: + for item in page_list: + yield item + page_list = self.execute_request(list_request) @cached_property def cached_root_dirs(self): - self.cached_dirs = {} - self.cached_dir_id = {} + cached_dirs = {} + self.cached_ids = {} for dir1 in self.list_drive_item( "'{}' in parents and trashed=false".format(self.root_id) ): - self.cached_dirs[dir1["title"]] = dir1["id"] - self.cached_dir_id[dir1["id"]] = dir1["title"] - return self.cached_dirs + cached_dirs[dir1["title"]] = dir1["id"] + self.cached_ids[dir1["id"]] = dir1["title"] + return cached_dirs @cached_property - def raw_drive(self): + def drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive import logging @@ -181,13 +107,14 @@ def raw_drive(self): gdrive = GoogleDrive(gauth) return gdrive - @property - def drive(self): - return self.raw_drive - def create_drive_item(self, parent_id, title): upload_request = RequestUploadFile( - self.drive, title, parent_id, self.FOLDER_MIME_TYPE + { + "drive": self.drive, + "title": title, + "parent_id": parent_id, + "mime_type": self.FOLDER_MIME_TYPE, + } ) result = self.execute_request(upload_request) return result @@ -247,10 +174,12 @@ def _upload(self, from_file, to_info, name, no_progress_bar): parent_id = to_info.netloc upload_request = RequestUploadFile( - self.drive, - to_info.name, - parent_id, - "", + { + "drive": self.drive, + "title": to_info.name, + "parent_id": parent_id, + "mime_type": "", + }, no_progress_bar, from_file, name, @@ -260,7 +189,13 @@ def _upload(self, from_file, to_info, name, no_progress_bar): def _download(self, from_info, to_file, name, no_progress_bar): file_id = self.get_path_id(from_info) download_request = RequestDownloadFile( - self.drive, file_id, to_file, name, no_progress_bar + { + "drive": self.drive, + "file_id": file_id, + "to_file": to_file, + "progress_name": name, + "no_progress_bar": no_progress_bar, + } ) self.execute_request(download_request) @@ -293,13 +228,14 @@ def all(self): return query += " 
and trashed=false" print("All query: {}".format(query)) + counter = 0 for file1 in self.list_drive_item(query): parent_id = file1["parents"][0]["id"] - print(self.cached_dir_id[parent_id]) + print(self.cached_ids[parent_id]) print(file1["title"]) - path = posixpath.join( - self.cached_dir_id[parent_id], file1["title"] - ) + counter += 1 + print("{}".format(counter)) + path = posixpath.join(self.cached_ids[parent_id], file1["title"]) try: yield self.path_to_checksum(path) except ValueError: diff --git a/dvc/remote/gdrive/pydrive.py b/dvc/remote/gdrive/pydrive.py new file mode 100644 index 0000000000..812af3fcfa --- /dev/null +++ b/dvc/remote/gdrive/pydrive.py @@ -0,0 +1,93 @@ +import os + +from dvc.remote.gdrive.utils import TrackFileReadProgress + + +class RequestBASE: + def __init__(self, drive): + self.drive = drive + + def execute(self): + raise NotImplementedError + + +class RequestListFile(RequestBASE): + def __init__(self, drive, query): + super(RequestListFile, self).__init__(drive) + self.query = query + + def execute(self): + return self.drive.ListFile( + {"q": self.query, "maxResults": 1000} + ).GetList() + + +class RequestListFilePaginated(RequestBASE): + def __init__(self, drive, query): + super(RequestListFilePaginated, self).__init__(drive) + self.query = query + self.iter = None + + def execute(self): + if not self.iter: + self.iter = iter( + self.drive.ListFile({"q": self.query, "maxResults": 1000}) + ) + return next(self.iter, None) + + +class RequestUploadFile(RequestBASE): + def __init__( + self, args, no_progress_bar=True, from_file="", progress_name="" + ): + super(RequestUploadFile, self).__init__(args["drive"]) + self.title = args["title"] + self.parent_id = args["parent_id"] + self.mime_type = args["mime_type"] + self.no_progress_bar = no_progress_bar + self.from_file = from_file + self.proress_name = progress_name + + def upload(self, item): + with open(self.from_file, "rb") as from_file: + if not self.no_progress_bar: + from_file = TrackFileReadProgress(self.proress_name, from_file) + if os.stat(self.from_file).st_size: + item.content = from_file + item.Upload() + + def execute(self): + item = self.drive.CreateFile( + { + "title": self.title, + "parents": [{"id": self.parent_id}], + "mimeType": self.mime_type, + } + ) + if self.mime_type == "application/vnd.google-apps.folder": + item.Upload() + else: + self.upload(item) + + return item + + +class RequestDownloadFile(RequestBASE): + def __init__(self, args): + super(RequestDownloadFile, self).__init__(args["drive"]) + self.file_id = args["file_id"] + self.to_file = args["to_file"] + self.progress_name = args["progress_name"] + self.no_progress_bar = args["no_progress_bar"] + + def execute(self): + from dvc.progress import Tqdm + + gdrive_file = self.drive.CreateFile({"id": self.file_id}) + if not self.no_progress_bar: + tqdm = Tqdm( + desc=self.progress_name, total=int(gdrive_file["fileSize"]) + ) + gdrive_file.GetContentFile(self.to_file) + if not self.no_progress_bar: + tqdm.close() From 22d7fd73c5a82f967edab220a5c930671266aaf6 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Wed, 23 Oct 2019 11:41:01 -0700 Subject: [PATCH 22/33] Add missed dep to setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d1d0cf3d15..19f523979a 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ def run(self): # Extra dependencies for remote integrations gs = ["google-cloud-storage==1.19.0"] -gdrive = ["pydrive==1.3.1", "ratelimit==2.2.1"] +gdrive = ["pydrive==1.3.1", 
"ratelimit==2.2.1", "backoff>=1.8.1"] s3 = ["boto3==1.9.115"] azure = ["azure-storage-blob==2.1.0"] oss = ["oss2==2.6.1"] From 917f5c0d4ad12d9629d9848d25107bb26eb9c536 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Wed, 23 Oct 2019 12:17:56 -0700 Subject: [PATCH 23/33] Fix tests --- tests/unit/remote/gdrive/test_gdrive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/remote/gdrive/test_gdrive.py b/tests/unit/remote/gdrive/test_gdrive.py index 6b35123149..9757f03661 100644 --- a/tests/unit/remote/gdrive/test_gdrive.py +++ b/tests/unit/remote/gdrive/test_gdrive.py @@ -2,14 +2,14 @@ from dvc.remote.gdrive import RemoteGDrive -@mock.patch("dvc.remote.gdrive.RemoteGDrive.drive") +@mock.patch("dvc.remote.gdrive.RemoteGDrive.init_drive") def test_init_drive(repo): url = "gdrive://root/data" gdrive = RemoteGDrive(repo, {"url": url}) assert str(gdrive.path_info) == url -@mock.patch("dvc.remote.gdrive.RemoteGDrive.drive") +@mock.patch("dvc.remote.gdrive.RemoteGDrive.init_drive") def test_init_folder_id(repo): url = "gdrive://folder_id/data" gdrive = RemoteGDrive(repo, {"url": url}) From 1abb73501c98ab92c88d70ce8bdd6e757dee6f81 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 26 Oct 2019 04:14:13 -0700 Subject: [PATCH 24/33] Support multiple directories with similar titles --- dvc/config.py | 5 +++ dvc/remote/gdrive/__init__.py | 84 ++++++++++++++++++++++++----------- 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 2ba234fd0f..a780a4b5de 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -247,7 +247,11 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_GCP_PROJECTNAME): str, } + SECTION_GDRIVE = "gdrive" SECTION_GDRIVE_CREDENTIALPATH = CREDENTIALPATH + SECTION_GDRIVE_SCHEMA = { + Optional(SECTION_GDRIVE_CREDENTIALPATH): str, + } # backward compatibility SECTION_LOCAL = "local" @@ -316,6 +320,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_AWS, default={}): SECTION_AWS_SCHEMA, Optional(SECTION_GCP, default={}): SECTION_GCP_SCHEMA, Optional(SECTION_LOCAL, default={}): SECTION_LOCAL_SCHEMA, + Optional(SECTION_GDRIVE, default={}): SECTION_GDRIVE_SCHEMA, } def __init__(self, dvc_dir=None, validate=True): diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 8e43995b44..f5d7e457df 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -47,11 +47,15 @@ def __init__(self, repo, config): Config.SECTION_GDRIVE_CREDENTIALPATH, self.DEFAULT_GOOGLE_AUTH_SETTINGS_PATH, ) + core = config.get(Config.SECTION_GDRIVE, {}) + print("Credentials path: {} , {}".format(self.gdrive_credentials_path, core)) self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) + print("!!!!!!!!!!!!!!!!! 
Init") self.init_drive() def init_drive(self): self.root_id = self.get_path_id(self.path_info, create=True) + self.cache_root_dirs() @on_exception(expo, DvcException, max_tries=8) @sleep_and_retry @@ -73,16 +77,17 @@ def list_drive_item(self, query): yield item page_list = self.execute_request(list_request) - @cached_property - def cached_root_dirs(self): - cached_dirs = {} + def cache_root_dirs(self): + print("Gather cache...........................................") + self.cached_dirs = {} self.cached_ids = {} for dir1 in self.list_drive_item( "'{}' in parents and trashed=false".format(self.root_id) ): - cached_dirs[dir1["title"]] = dir1["id"] + self.cached_dirs.setdefault(dir1["title"], []).append(dir1["id"]) + print("Cashing {} with id {}".format(dir1["title"], dir1["id"])) self.cached_ids[dir1["id"]] = dir1["title"] - return cached_dirs + print("Cached root dir content: {}".format(self.cached_dirs)) @cached_property def drive(self): @@ -119,48 +124,73 @@ def create_drive_item(self, parent_id, title): result = self.execute_request(upload_request) return result - def get_drive_item(self, name, parent_id): + def get_drive_item(self, name, parents_ids): + print('get_drive_item for parents_ids {}'.format(parents_ids)) + query = " or ".join( + "'{}' in parents".format(parent_id) + for parent_id in parents_ids + ) + if not query: + return + query += " and trashed=false and title='{}'".format(name) + print("get_drive_item query: {}".format(query)) + list_request = RequestListFile( self.drive, - "'{}' in parents and trashed=false and title='{}'".format( - parent_id, name - ), + query, ) item_list = self.execute_request(list_request) return next(iter(item_list), None) - def resolve_remote_file(self, parent_id, path_parts, create): + def resolve_remote_file(self, parents_ids, path_parts, create): + print("resolve remote file for {}".format(path_parts)) for path_part in path_parts: - item = self.get_drive_item(path_part, parent_id) + item = self.get_drive_item(path_part, parents_ids) if not item and create: - item = self.create_drive_item(parent_id, path_part) + item = self.create_drive_item(parents_ids[0], path_part) elif not item: return None - parent_id = item["id"] + parents_ids = [item["id"]] return item + def subtract_root_path(self, parts): + parents_ids = [self.path_info.netloc] + if not hasattr(self, "root_id"): + return parts, parents_ids + + for part in self.path_info.path.split("/"): + print("subtract_root_path compare {} with {}".format(part, parts[0])) + if parts and parts[0] == part: + parts.pop(0) + parents_ids = [self.root_id] + else: + break + return parts, parents_ids + def get_path_id_from_cache(self, path_info): - file_id = "" - parts = path_info.path.split("/") + files_ids = [] + parts, parents_ids = self.subtract_root_path(path_info.path.split("/")) + print("Resolved parts: {}".format(parts)) if ( path_info != self.path_info and parts - and (parts[0] in self.cached_root_dirs) + and (parts[0] in self.cached_dirs) ): - parent_id = self.cached_root_dirs[parts[0]] - file_id = self.cached_root_dirs[parts[0]] + parents_ids = self.cached_dirs[parts[0]] + print('Parents_ids resolved from cash for {} as {}'.format(parts[0], self.cached_dirs[parts[0]])) + files_ids = self.cached_dirs[parts[0]] parts.pop(0) - else: - parent_id = path_info.netloc - return file_id, parent_id, parts + + return files_ids, parents_ids, parts def get_path_id(self, path_info, create=False): - file_id, parent_id, parts = self.get_path_id_from_cache(path_info) + print("get_path_id for path 
{}".format(path_info)) + files_ids, parents_ids, parts = self.get_path_id_from_cache(path_info) - if not parts and file_id: - return file_id + if not parts and files_ids: + return files_ids[0] - file1 = self.resolve_remote_file(parent_id, parts, create) + file1 = self.resolve_remote_file(parents_ids, parts, create) return file1["id"] if file1 else "" def exists(self, path_info): @@ -170,6 +200,7 @@ def _upload(self, from_file, to_info, name, no_progress_bar): dirname = to_info.parent if dirname: parent_id = self.get_path_id(dirname, True) + print("parent_id on upload resolved as: {}".format(parent_id)) else: parent_id = to_info.netloc @@ -220,9 +251,10 @@ def list_path(self, parent_id): yield path def all(self): + print('All') query = " or ".join( "'{}' in parents".format(dir_id) - for dir_title, dir_id in self.cached_root_dirs.items() + for dir_id in self.cached_ids ) if not query: return From e1ed0b1a5d8c68719b626f6d617ed464896ba964 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 26 Oct 2019 05:26:39 -0700 Subject: [PATCH 25/33] Raise exception on missed gdrive settings file path in config --- MANIFEST.in | 1 - dvc/config.py | 5 --- dvc/remote/gdrive/__init__.py | 63 ++++++++------------------------- dvc/remote/gdrive/pydrive.py | 4 +-- dvc/remote/gdrive/settings.yaml | 14 -------- dvc/remote/gdrive/utils.py | 38 +------------------- 6 files changed, 17 insertions(+), 108 deletions(-) delete mode 100644 dvc/remote/gdrive/settings.yaml diff --git a/MANIFEST.in b/MANIFEST.in index 35e43bb296..03d2f17a9a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ include fastentrypoints.py include LICENSE -include dvc/remote/gdrive/settings.yaml diff --git a/dvc/config.py b/dvc/config.py index a780a4b5de..2ba234fd0f 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -247,11 +247,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_GCP_PROJECTNAME): str, } - SECTION_GDRIVE = "gdrive" SECTION_GDRIVE_CREDENTIALPATH = CREDENTIALPATH - SECTION_GDRIVE_SCHEMA = { - Optional(SECTION_GDRIVE_CREDENTIALPATH): str, - } # backward compatibility SECTION_LOCAL = "local" @@ -320,7 +316,6 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_AWS, default={}): SECTION_AWS_SCHEMA, Optional(SECTION_GCP, default={}): SECTION_GCP_SCHEMA, Optional(SECTION_LOCAL, default={}): SECTION_LOCAL_SCHEMA, - Optional(SECTION_GDRIVE, default={}): SECTION_GDRIVE_SCHEMA, } def __init__(self, dvc_dir=None, validate=True): diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index f5d7e457df..91707dec9e 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -11,7 +11,6 @@ from dvc.path_info import CloudURLInfo from dvc.remote.base import RemoteBASE from dvc.config import Config -from dvc.remote.gdrive.utils import shared_token_warning from dvc.exceptions import DvcException from dvc.remote.gdrive.pydrive import ( RequestListFile, @@ -19,6 +18,7 @@ RequestUploadFile, RequestDownloadFile, ) +from dvc.remote.gdrive.utils import FOLDER_MIME_TYPE class GDriveURLInfo(CloudURLInfo): @@ -32,25 +32,21 @@ class RemoteGDrive(RemoteBASE): path_cls = GDriveURLInfo REGEX = r"^gdrive://.*$" REQUIRES = {"pydrive": "pydrive"} - PARAM_CHECKSUM = "md5Checksum" - DEFAULT_GOOGLE_AUTH_SETTINGS_PATH = os.path.join( - os.path.dirname(__file__), "settings.yaml" - ) - FOLDER_MIME_TYPE = "application/vnd.google-apps.folder" def __init__(self, repo, config): super(RemoteGDrive, self).__init__(repo, config) self.no_traverse = False 
+ self.cached_dirs = {} + self.cached_ids = {} if Config.SECTION_GDRIVE_CREDENTIALPATH not in config: - shared_token_warning() + raise DvcException( + "Google Drive settings file path is missed from config. " + "Learn more at https://dvc.org/doc." + ) self.gdrive_credentials_path = config.get( - Config.SECTION_GDRIVE_CREDENTIALPATH, - self.DEFAULT_GOOGLE_AUTH_SETTINGS_PATH, + Config.SECTION_GDRIVE_CREDENTIALPATH ) - core = config.get(Config.SECTION_GDRIVE, {}) - print("Credentials path: {} , {}".format(self.gdrive_credentials_path, core)) self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) - print("!!!!!!!!!!!!!!!!! Init") self.init_drive() def init_drive(self): @@ -78,22 +74,16 @@ def list_drive_item(self, query): page_list = self.execute_request(list_request) def cache_root_dirs(self): - print("Gather cache...........................................") - self.cached_dirs = {} - self.cached_ids = {} for dir1 in self.list_drive_item( "'{}' in parents and trashed=false".format(self.root_id) ): self.cached_dirs.setdefault(dir1["title"], []).append(dir1["id"]) - print("Cashing {} with id {}".format(dir1["title"], dir1["id"])) self.cached_ids[dir1["id"]] = dir1["title"] - print("Cached root dir content: {}".format(self.cached_dirs)) @cached_property def drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive - import logging if os.getenv("PYDRIVE_USER_CREDENTIALS_DATA"): with open("credentials.json", "w") as credentials_file: @@ -101,11 +91,6 @@ def drive(self): os.getenv("PYDRIVE_USER_CREDENTIALS_DATA") ) - # Supress import error on GoogleAuth warning - logging.getLogger("googleapiclient.discovery_cache").setLevel( - logging.ERROR - ) - GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) gauth.CommandLineAuth() @@ -118,32 +103,25 @@ def create_drive_item(self, parent_id, title): "drive": self.drive, "title": title, "parent_id": parent_id, - "mime_type": self.FOLDER_MIME_TYPE, + "mime_type": FOLDER_MIME_TYPE, } ) result = self.execute_request(upload_request) return result def get_drive_item(self, name, parents_ids): - print('get_drive_item for parents_ids {}'.format(parents_ids)) query = " or ".join( - "'{}' in parents".format(parent_id) - for parent_id in parents_ids + "'{}' in parents".format(parent_id) for parent_id in parents_ids ) if not query: return query += " and trashed=false and title='{}'".format(name) - print("get_drive_item query: {}".format(query)) - list_request = RequestListFile( - self.drive, - query, - ) + list_request = RequestListFile(self.drive, query) item_list = self.execute_request(list_request) return next(iter(item_list), None) def resolve_remote_file(self, parents_ids, path_parts, create): - print("resolve remote file for {}".format(path_parts)) for path_part in path_parts: item = self.get_drive_item(path_part, parents_ids) if not item and create: @@ -157,9 +135,8 @@ def subtract_root_path(self, parts): parents_ids = [self.path_info.netloc] if not hasattr(self, "root_id"): return parts, parents_ids - + for part in self.path_info.path.split("/"): - print("subtract_root_path compare {} with {}".format(part, parts[0])) if parts and parts[0] == part: parts.pop(0) parents_ids = [self.root_id] @@ -170,21 +147,18 @@ def subtract_root_path(self, parts): def get_path_id_from_cache(self, path_info): files_ids = [] parts, parents_ids = self.subtract_root_path(path_info.path.split("/")) - print("Resolved parts: {}".format(parts)) if ( path_info != 
self.path_info and parts and (parts[0] in self.cached_dirs) ): parents_ids = self.cached_dirs[parts[0]] - print('Parents_ids resolved from cash for {} as {}'.format(parts[0], self.cached_dirs[parts[0]])) files_ids = self.cached_dirs[parts[0]] parts.pop(0) return files_ids, parents_ids, parts def get_path_id(self, path_info, create=False): - print("get_path_id for path {}".format(path_info)) files_ids, parents_ids, parts = self.get_path_id_from_cache(path_info) if not parts and files_ids: @@ -200,7 +174,6 @@ def _upload(self, from_file, to_info, name, no_progress_bar): dirname = to_info.parent if dirname: parent_id = self.get_path_id(dirname, True) - print("parent_id on upload resolved as: {}".format(parent_id)) else: parent_id = to_info.netloc @@ -237,7 +210,7 @@ def list_cache_paths(self): yield posixpath.join(prefix, path) def list_file_path(self, drive_file): - if drive_file["mimeType"] == self.FOLDER_MIME_TYPE: + if drive_file["mimeType"] == FOLDER_MIME_TYPE: for i in self.list_path(drive_file["id"]): yield posixpath.join(drive_file["title"], i) else: @@ -251,22 +224,14 @@ def list_path(self, parent_id): yield path def all(self): - print('All') query = " or ".join( - "'{}' in parents".format(dir_id) - for dir_id in self.cached_ids + "'{}' in parents".format(dir_id) for dir_id in self.cached_ids ) if not query: return query += " and trashed=false" - print("All query: {}".format(query)) - counter = 0 for file1 in self.list_drive_item(query): parent_id = file1["parents"][0]["id"] - print(self.cached_ids[parent_id]) - print(file1["title"]) - counter += 1 - print("{}".format(counter)) path = posixpath.join(self.cached_ids[parent_id], file1["title"]) try: yield self.path_to_checksum(path) diff --git a/dvc/remote/gdrive/pydrive.py b/dvc/remote/gdrive/pydrive.py index 812af3fcfa..972171bd1b 100644 --- a/dvc/remote/gdrive/pydrive.py +++ b/dvc/remote/gdrive/pydrive.py @@ -1,6 +1,6 @@ import os -from dvc.remote.gdrive.utils import TrackFileReadProgress +from dvc.remote.gdrive.utils import TrackFileReadProgress, FOLDER_MIME_TYPE class RequestBASE: @@ -64,7 +64,7 @@ def execute(self): "mimeType": self.mime_type, } ) - if self.mime_type == "application/vnd.google-apps.folder": + if self.mime_type == FOLDER_MIME_TYPE: item.Upload() else: self.upload(item) diff --git a/dvc/remote/gdrive/settings.yaml b/dvc/remote/gdrive/settings.yaml deleted file mode 100644 index f7d5779666..0000000000 --- a/dvc/remote/gdrive/settings.yaml +++ /dev/null @@ -1,14 +0,0 @@ -client_config_backend: settings -client_config: - client_id: 719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com - client_secret: 2fy_HyzSwkxkGzEken7hThXb - -save_credentials: True -save_credentials_backend: file -save_credentials_file: credentials.json - -get_refresh_token: True - -oauth_scope: - - https://www.googleapis.com/auth/drive - - https://www.googleapis.com/auth/drive.appdata diff --git a/dvc/remote/gdrive/utils.py b/dvc/remote/gdrive/utils.py index 5331165640..0f3cf02cd0 100644 --- a/dvc/remote/gdrive/utils.py +++ b/dvc/remote/gdrive/utils.py @@ -1,15 +1,9 @@ -import functools import os -import threading -import logging from dvc.progress import Tqdm -LOGGER = logging.getLogger(__name__) - - -MIME_GOOGLE_APPS_FOLDER = "application/vnd.google-apps.folder" +FOLDER_MIME_TYPE = "application/vnd.google-apps.folder" class TrackFileReadProgress(object): @@ -36,33 +30,3 @@ def close(self): def __getattr__(self, attr): return getattr(self.fobj, attr) - - -def only_once(func): - lock = threading.Lock() - locks = {} - results = {} 
- - @functools.wraps(func) - def wrapped(*args, **kwargs): - key = (args, tuple(kwargs.items())) - # could do with just setdefault, but it would require - # create/delete a "default" Lock() object for each call, so it - # is better to lock a single one for a short time - with lock: - if key not in locks: - locks[key] = threading.Lock() - with locks[key]: - if key not in results: - results[key] = func(*args, **kwargs) - return results[key] - - return wrapped - - -@only_once -def shared_token_warning(): - LOGGER.warning( - "Warning: a shared GoogleAPI token is in use. " - "Please create your own token." - ) From b71f0b2e1a1a2889b02397f564bc9d1664cda871 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 26 Oct 2019 07:56:35 -0700 Subject: [PATCH 26/33] Fix tests --- dvc/config.py | 2 -- dvc/remote/gdrive/__init__.py | 15 ++++++++------- tests/func/gdrive-settings.yaml | 16 ++++++++++++++++ tests/func/test_data_cloud.py | 16 ++++++++++++++++ 4 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 tests/func/gdrive-settings.yaml diff --git a/dvc/config.py b/dvc/config.py index 2ba234fd0f..1c6b9b4217 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -247,8 +247,6 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_GCP_PROJECTNAME): str, } - SECTION_GDRIVE_CREDENTIALPATH = CREDENTIALPATH - # backward compatibility SECTION_LOCAL = "local" SECTION_LOCAL_STORAGEPATH = SECTION_AWS_STORAGEPATH diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 91707dec9e..ed3149e5b9 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -38,18 +38,19 @@ def __init__(self, repo, config): self.no_traverse = False self.cached_dirs = {} self.cached_ids = {} - if Config.SECTION_GDRIVE_CREDENTIALPATH not in config: + self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) + self.config = config + self.init_drive() + + def init_drive(self): + if Config.SECTION_REMOTE_KEY_FILE not in self.config: raise DvcException( "Google Drive settings file path is missed from config. " "Learn more at https://dvc.org/doc." 
) - self.gdrive_credentials_path = config.get( - Config.SECTION_GDRIVE_CREDENTIALPATH + self.gdrive_credentials_path = self.config.get( + Config.SECTION_REMOTE_KEY_FILE ) - self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) - self.init_drive() - - def init_drive(self): self.root_id = self.get_path_id(self.path_info, create=True) self.cache_root_dirs() diff --git a/tests/func/gdrive-settings.yaml b/tests/func/gdrive-settings.yaml new file mode 100644 index 0000000000..561694451a --- /dev/null +++ b/tests/func/gdrive-settings.yaml @@ -0,0 +1,16 @@ +client_config_backend: settings +client_config: + client_id: 719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com + client_secret: 2fy_HyzSwkxkGzEken7hThXb + +save_credentials: True +save_credentials_backend: file +save_credentials_file: credentials.json + +get_refresh_token: True + +oauth_scope: + - https://www.googleapis.com/auth/drive + - https://www.googleapis.com/auth/drive.appdata + + diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 9524ebcaa0..d0388bfc60 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -58,6 +58,10 @@ # Ensure that absolute path is used os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = TEST_GCP_CREDS_FILE +TEST_GDRIVE_GOOGLE_AUTH_SETTINGS_PATH = os.path.join( + os.path.dirname(__file__), "gdrive-settings.yaml" +) + def _should_test_aws(): do_test = env2bool("DVC_TEST_AWS", undefined=None) @@ -390,6 +394,9 @@ class TestRemoteGDrive(TestDataCloudBase): def _should_test(self): return _should_test_gdrive() + def _get_keyfile(self): + return TEST_GDRIVE_GOOGLE_AUTH_SETTINGS_PATH + def _get_url(self): return get_gdrive_url() @@ -651,6 +658,15 @@ def _test(self): url = get_gdrive_url() self.main(["remote", "add", TEST_REMOTE, url]) + self.main( + [ + "remote", + "modify", + TEST_REMOTE, + Config.SECTION_REMOTE_KEY_FILE, + TEST_GDRIVE_GOOGLE_AUTH_SETTINGS_PATH, + ] + ) self._test_cloud(TEST_REMOTE) From 8589dec0ae1a9b5fabe9937d7b85232985c12108 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 26 Oct 2019 08:16:35 -0700 Subject: [PATCH 27/33] Fix DeepSource findings --- dvc/remote/gdrive/__init__.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index ed3149e5b9..aea1521950 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -115,7 +115,7 @@ def get_drive_item(self, name, parents_ids): "'{}' in parents".format(parent_id) for parent_id in parents_ids ) if not query: - return + return None query += " and trashed=false and title='{}'".format(name) list_request = RequestListFile(self.drive, query) @@ -133,17 +133,15 @@ def resolve_remote_file(self, parents_ids, path_parts, create): return item def subtract_root_path(self, parts): - parents_ids = [self.path_info.netloc] if not hasattr(self, "root_id"): - return parts, parents_ids + return parts, [self.path_info.netloc] for part in self.path_info.path.split("/"): if parts and parts[0] == part: parts.pop(0) - parents_ids = [self.root_id] else: break - return parts, parents_ids + return parts, [self.root_id] def get_path_id_from_cache(self, path_info): files_ids = [] From 28e208c618c6c832308063e8e5027cf5a5723dff Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sat, 2 Nov 2019 09:54:54 -0700 Subject: [PATCH 28/33] Fix code review findings --- dvc/remote/gdrive/__init__.py | 43 ++++++++++++++----------- setup.py | 2 +- tests/func/gdrive-settings.yaml | 2 -- 
tests/func/test_data_cloud.py | 2 +- tests/unit/remote/gdrive/test_gdrive.py | 7 ---- tests/unit/remote/gdrive/test_utils.py | 0 6 files changed, 27 insertions(+), 29 deletions(-) delete mode 100644 tests/unit/remote/gdrive/test_utils.py diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index aea1521950..53b7f4d034 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -2,9 +2,9 @@ import os import posixpath +import logging from funcy import cached_property -from ratelimit import limits, sleep_and_retry from backoff import on_exception, expo from dvc.scheme import Schemes @@ -20,6 +20,8 @@ ) from dvc.remote.gdrive.utils import FOLDER_MIME_TYPE +LOGGER = logging.getLogger(__name__) + class GDriveURLInfo(CloudURLInfo): @property @@ -32,6 +34,8 @@ class RemoteGDrive(RemoteBASE): path_cls = GDriveURLInfo REGEX = r"^gdrive://.*$" REQUIRES = {"pydrive": "pydrive"} + GDRIVE_USER_CREDENTIALS_DATA = "GDRIVE_USER_CREDENTIALS_DATA" + CREDENTIALS_FILE_PATH = "credentials.json" def __init__(self, repo, config): super(RemoteGDrive, self).__init__(repo, config) @@ -43,26 +47,26 @@ def __init__(self, repo, config): self.init_drive() def init_drive(self): - if Config.SECTION_REMOTE_KEY_FILE not in self.config: + self.gdrive_credentials_path = self.config.get( + Config.SECTION_REMOTE_KEY_FILE, None + ) + if not self.gdrive_credentials_path: raise DvcException( "Google Drive settings file path is missed from config. " - "Learn more at https://dvc.org/doc." + "Learn more at " + "https://dvc.org/doc/command-reference/remote/add." ) - self.gdrive_credentials_path = self.config.get( - Config.SECTION_REMOTE_KEY_FILE - ) self.root_id = self.get_path_id(self.path_info, create=True) self.cache_root_dirs() @on_exception(expo, DvcException, max_tries=8) - @sleep_and_retry - @limits(calls=10, period=1) def execute_request(self, request): try: result = request.execute() except Exception as exception: - if "Rate Limit Exceeded" in str(exception): - raise DvcException("API usage rate limit exceeded") + retry_codes = ["403", "500", "502", "503", "504"] + if any(code in str(exception) for code in retry_codes): + raise DvcException("Google API request failed") raise return result @@ -86,10 +90,10 @@ def drive(self): from pydrive.auth import GoogleAuth from pydrive.drive import GoogleDrive - if os.getenv("PYDRIVE_USER_CREDENTIALS_DATA"): - with open("credentials.json", "w") as credentials_file: + if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): + with open(self.CREDENTIALS_FILE_PATH, "w") as credentials_file: credentials_file.write( - os.getenv("PYDRIVE_USER_CREDENTIALS_DATA") + os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) ) GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" @@ -111,11 +115,12 @@ def create_drive_item(self, parent_id, title): return result def get_drive_item(self, name, parents_ids): + if not parents_ids: + return None query = " or ".join( "'{}' in parents".format(parent_id) for parent_id in parents_ids ) - if not query: - return None + query += " and trashed=false and title='{}'".format(name) list_request = RequestListFile(self.drive, query) @@ -223,11 +228,13 @@ def list_path(self, parent_id): yield path def all(self): + if not self.cached_ids: + return + query = " or ".join( "'{}' in parents".format(dir_id) for dir_id in self.cached_ids ) - if not query: - return + query += " and trashed=false" for file1 in self.list_drive_item(query): parent_id = file1["parents"][0]["id"] @@ -236,4 +243,4 @@ def all(self): yield 
self.path_to_checksum(path) except ValueError: # We ignore all the non-cache looking files - pass + LOGGER.debug('Ignoring path as "non-cache looking"') diff --git a/setup.py b/setup.py index 19f523979a..31ba4deb23 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ def run(self): # Extra dependencies for remote integrations gs = ["google-cloud-storage==1.19.0"] -gdrive = ["pydrive==1.3.1", "ratelimit==2.2.1", "backoff>=1.8.1"] +gdrive = ["pydrive==1.3.1", "backoff>=1.8.1"] s3 = ["boto3==1.9.115"] azure = ["azure-storage-blob==2.1.0"] oss = ["oss2==2.6.1"] diff --git a/tests/func/gdrive-settings.yaml b/tests/func/gdrive-settings.yaml index 561694451a..f7d5779666 100644 --- a/tests/func/gdrive-settings.yaml +++ b/tests/func/gdrive-settings.yaml @@ -12,5 +12,3 @@ get_refresh_token: True oauth_scope: - https://www.googleapis.com/auth/drive - https://www.googleapis.com/auth/drive.appdata - - diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index d0388bfc60..a4f70d3ceb 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -75,7 +75,7 @@ def _should_test_aws(): def _should_test_gdrive(): - if os.getenv("PYDRIVE_USER_CREDENTIALS_DATA"): + if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): return True return False diff --git a/tests/unit/remote/gdrive/test_gdrive.py b/tests/unit/remote/gdrive/test_gdrive.py index 9757f03661..28e003748c 100644 --- a/tests/unit/remote/gdrive/test_gdrive.py +++ b/tests/unit/remote/gdrive/test_gdrive.py @@ -7,10 +7,3 @@ def test_init_drive(repo): url = "gdrive://root/data" gdrive = RemoteGDrive(repo, {"url": url}) assert str(gdrive.path_info) == url - - -@mock.patch("dvc.remote.gdrive.RemoteGDrive.init_drive") -def test_init_folder_id(repo): - url = "gdrive://folder_id/data" - gdrive = RemoteGDrive(repo, {"url": url}) - assert str(gdrive.path_info) == url diff --git a/tests/unit/remote/gdrive/test_utils.py b/tests/unit/remote/gdrive/test_utils.py deleted file mode 100644 index e69de29bb2..0000000000 From 0afd64c26897b178ae9fa5a7481ed7ce2639ffd7 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Sun, 3 Nov 2019 12:40:49 -0800 Subject: [PATCH 29/33] Fix more code review findings --- dvc/remote/gdrive/__init__.py | 33 ++++++++++++++----------------- dvc/remote/gdrive/pydrive.py | 37 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 53b7f4d034..0e33db1741 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -15,12 +15,13 @@ from dvc.remote.gdrive.pydrive import ( RequestListFile, RequestListFilePaginated, + RequestCreateFolder, RequestUploadFile, RequestDownloadFile, ) from dvc.remote.gdrive.utils import FOLDER_MIME_TYPE -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class GDriveURLInfo(CloudURLInfo): @@ -40,8 +41,6 @@ class RemoteGDrive(RemoteBASE): def __init__(self, repo, config): super(RemoteGDrive, self).__init__(repo, config) self.no_traverse = False - self.cached_dirs = {} - self.cached_ids = {} self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) self.config = config self.init_drive() @@ -54,10 +53,10 @@ def init_drive(self): raise DvcException( "Google Drive settings file path is missed from config. " "Learn more at " - "https://dvc.org/doc/command-reference/remote/add." + "https://man.dvc.org/remote/add." 
) self.root_id = self.get_path_id(self.path_info, create=True) - self.cache_root_dirs() + self.cached_dirs, self.cached_ids = self.cache_root_dirs() @on_exception(expo, DvcException, max_tries=8) def execute_request(self, request): @@ -79,11 +78,14 @@ def list_drive_item(self, query): page_list = self.execute_request(list_request) def cache_root_dirs(self): + cached_dirs = {} + cached_ids = {} for dir1 in self.list_drive_item( "'{}' in parents and trashed=false".format(self.root_id) ): - self.cached_dirs.setdefault(dir1["title"], []).append(dir1["id"]) - self.cached_ids[dir1["id"]] = dir1["title"] + cached_dirs.setdefault(dir1["title"], []).append(dir1["id"]) + cached_ids[dir1["id"]] = dir1["title"] + return cached_dirs, cached_ids @cached_property def drive(self): @@ -103,13 +105,8 @@ def drive(self): return gdrive def create_drive_item(self, parent_id, title): - upload_request = RequestUploadFile( - { - "drive": self.drive, - "title": title, - "parent_id": parent_id, - "mime_type": FOLDER_MIME_TYPE, - } + upload_request = RequestCreateFolder( + {"drive": self.drive, "title": title, "parent_id": parent_id} ) result = self.execute_request(upload_request) return result @@ -152,7 +149,8 @@ def get_path_id_from_cache(self, path_info): files_ids = [] parts, parents_ids = self.subtract_root_path(path_info.path.split("/")) if ( - path_info != self.path_info + hasattr(self, "cached_dirs") + and path_info != self.path_info and parts and (parts[0] in self.cached_dirs) ): @@ -186,7 +184,6 @@ def _upload(self, from_file, to_info, name, no_progress_bar): "drive": self.drive, "title": to_info.name, "parent_id": parent_id, - "mime_type": "", }, no_progress_bar, from_file, @@ -228,7 +225,7 @@ def list_path(self, parent_id): yield path def all(self): - if not self.cached_ids: + if not hasattr(self, "cached_ids") or not self.cached_ids: return query = " or ".join( @@ -243,4 +240,4 @@ def all(self): yield self.path_to_checksum(path) except ValueError: # We ignore all the non-cache looking files - LOGGER.debug('Ignoring path as "non-cache looking"') + logger.debug('Ignoring path as "non-cache looking"') diff --git a/dvc/remote/gdrive/pydrive.py b/dvc/remote/gdrive/pydrive.py index 972171bd1b..46836d04fc 100644 --- a/dvc/remote/gdrive/pydrive.py +++ b/dvc/remote/gdrive/pydrive.py @@ -36,6 +36,24 @@ def execute(self): return next(self.iter, None) +class RequestCreateFolder(RequestBASE): + def __init__(self, args): + super(RequestCreateFolder, self).__init__(args["drive"]) + self.title = args["title"] + self.parent_id = args["parent_id"] + + def execute(self): + item = self.drive.CreateFile( + { + "title": self.title, + "parents": [{"id": self.parent_id}], + "mimeType": FOLDER_MIME_TYPE, + } + ) + item.Upload() + return item + + class RequestUploadFile(RequestBASE): def __init__( self, args, no_progress_bar=True, from_file="", progress_name="" @@ -43,32 +61,25 @@ def __init__( super(RequestUploadFile, self).__init__(args["drive"]) self.title = args["title"] self.parent_id = args["parent_id"] - self.mime_type = args["mime_type"] self.no_progress_bar = no_progress_bar self.from_file = from_file - self.proress_name = progress_name + self.progress_name = progress_name def upload(self, item): with open(self.from_file, "rb") as from_file: if not self.no_progress_bar: - from_file = TrackFileReadProgress(self.proress_name, from_file) + from_file = TrackFileReadProgress( + self.progress_name, from_file + ) if os.stat(self.from_file).st_size: item.content = from_file item.Upload() def execute(self): item = 
self.drive.CreateFile( - { - "title": self.title, - "parents": [{"id": self.parent_id}], - "mimeType": self.mime_type, - } + {"title": self.title, "parents": [{"id": self.parent_id}]} ) - if self.mime_type == FOLDER_MIME_TYPE: - item.Upload() - else: - self.upload(item) - + self.upload(item) return item From f397ca8fd2ea60c485d1d084e8b2bbfa9f769052 Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Mon, 11 Nov 2019 06:45:29 -0800 Subject: [PATCH 30/33] GDrive settings via DVC config --- dvc/config.py | 7 +++++ dvc/remote/gdrive/__init__.py | 50 +++++++++++++++++++++++++++------ tests/func/gdrive-settings.yaml | 14 --------- tests/func/test_data_cloud.py | 37 ++++++++++++++++++++---- 4 files changed, 80 insertions(+), 28 deletions(-) delete mode 100644 tests/func/gdrive-settings.yaml diff --git a/dvc/config.py b/dvc/config.py index 1c6b9b4217..7f3d747817 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -257,6 +257,10 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_OSS_ACCESS_KEY_ID = "oss_key_id" SECTION_OSS_ACCESS_KEY_SECRET = "oss_key_secret" SECTION_OSS_ENDPOINT = "oss_endpoint" + # GDrive options + SECTION_GDRIVE_CLIENT_ID = "gdrive_client_id" + SECTION_GDRIVE_CLIENT_SECRET = "gdrive_client_secret" + SECTION_GDRIVE_USER_CREDENTIALS_FILE = "gdrive_user_credentials_file" SECTION_REMOTE_REGEX = r'^\s*remote\s*"(?P.*)"\s*$' SECTION_REMOTE_FMT = 'remote "{}"' @@ -293,6 +297,9 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_OSS_ACCESS_KEY_ID): str, Optional(SECTION_OSS_ACCESS_KEY_SECRET): str, Optional(SECTION_OSS_ENDPOINT): str, + Optional(SECTION_GDRIVE_CLIENT_ID): str, + Optional(SECTION_GDRIVE_CLIENT_SECRET): str, + Optional(SECTION_GDRIVE_USER_CREDENTIALS_FILE): str, Optional(PRIVATE_CWD): str, Optional(SECTION_REMOTE_NO_TRAVERSE, default=True): BOOL_SCHEMA, } diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 0e33db1741..8828e611d9 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -36,7 +36,7 @@ class RemoteGDrive(RemoteBASE): REGEX = r"^gdrive://.*$" REQUIRES = {"pydrive": "pydrive"} GDRIVE_USER_CREDENTIALS_DATA = "GDRIVE_USER_CREDENTIALS_DATA" - CREDENTIALS_FILE_PATH = "credentials.json" + DEFAULT_USER_CREDENTIALS_FILE = ".dvc/tmp/gdrive-user-credentials.json" def __init__(self, repo, config): super(RemoteGDrive, self).__init__(repo, config) @@ -46,15 +46,23 @@ def __init__(self, repo, config): self.init_drive() def init_drive(self): - self.gdrive_credentials_path = self.config.get( - Config.SECTION_REMOTE_KEY_FILE, None + self.gdrive_client_id = self.config.get( + Config.SECTION_GDRIVE_CLIENT_ID, None ) - if not self.gdrive_credentials_path: + self.gdrive_client_secret = self.config.get( + Config.SECTION_GDRIVE_CLIENT_SECRET, None + ) + if not self.gdrive_client_id or not self.gdrive_client_secret: raise DvcException( - "Google Drive settings file path is missed from config. " - "Learn more at " + "Please specify Google Drive's client id and " + "secret in DVC's config. Learn more at " "https://man.dvc.org/remote/add." 
) + self.gdrive_user_credentials_path = self.config.get( + Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE, + self.DEFAULT_USER_CREDENTIALS_FILE, + ) + self.root_id = self.get_path_id(self.path_info, create=True) self.cached_dirs, self.cached_ids = self.cache_root_dirs() @@ -93,15 +101,41 @@ def drive(self): from pydrive.drive import GoogleDrive if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): - with open(self.CREDENTIALS_FILE_PATH, "w") as credentials_file: + with open( + self.gdrive_user_credentials_path, "w" + ) as credentials_file: credentials_file.write( os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) ) GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" - gauth = GoogleAuth(settings_file=self.gdrive_credentials_path) + GoogleAuth.DEFAULT_SETTINGS["client_config"] = { + "client_id": self.gdrive_client_id, + "client_secret": self.gdrive_client_secret, + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "revoke_uri": "https://oauth2.googleapis.com/revoke", + "redirect_uri": "", + } + GoogleAuth.DEFAULT_SETTINGS["save_credentials"] = True + GoogleAuth.DEFAULT_SETTINGS["save_credentials_backend"] = "file" + GoogleAuth.DEFAULT_SETTINGS[ + "save_credentials_file" + ] = self.gdrive_user_credentials_path + GoogleAuth.DEFAULT_SETTINGS["get_refresh_token"] = True + GoogleAuth.DEFAULT_SETTINGS["oauth_scope"] = [ + "https://www.googleapis.com/auth/drive", + "https://www.googleapis.com/auth/drive.appdata", + ] + + # Pass non existent settings path to force DEFAULT_SETTINGS loading + gauth = GoogleAuth(settings_file="") gauth.CommandLineAuth() gdrive = GoogleDrive(gauth) + + if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): + os.remove(self.gdrive_user_credentials_path) + return gdrive def create_drive_item(self, parent_id, title): diff --git a/tests/func/gdrive-settings.yaml b/tests/func/gdrive-settings.yaml deleted file mode 100644 index f7d5779666..0000000000 --- a/tests/func/gdrive-settings.yaml +++ /dev/null @@ -1,14 +0,0 @@ -client_config_backend: settings -client_config: - client_id: 719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com - client_secret: 2fy_HyzSwkxkGzEken7hThXb - -save_credentials: True -save_credentials_backend: file -save_credentials_file: credentials.json - -get_refresh_token: True - -oauth_scope: - - https://www.googleapis.com/auth/drive - - https://www.googleapis.com/auth/drive.appdata diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index a4f70d3ceb..9b880ca61d 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -58,9 +58,10 @@ # Ensure that absolute path is used os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = TEST_GCP_CREDS_FILE -TEST_GDRIVE_GOOGLE_AUTH_SETTINGS_PATH = os.path.join( - os.path.dirname(__file__), "gdrive-settings.yaml" +TEST_GDRIVE_CLIENT_ID = ( + "719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com" ) +TEST_GDRIVE_CLIENT_SECRET = "2fy_HyzSwkxkGzEken7hThXb" def _should_test_aws(): @@ -394,8 +395,23 @@ class TestRemoteGDrive(TestDataCloudBase): def _should_test(self): return _should_test_gdrive() - def _get_keyfile(self): - return TEST_GDRIVE_GOOGLE_AUTH_SETTINGS_PATH + def _setup_cloud(self): + self._ensure_should_run() + + repo = self._get_url() + + config = copy.deepcopy(TEST_CONFIG) + config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo + config[TEST_SECTION][ + Config.SECTION_GDRIVE_CLIENT_ID + ] = TEST_GDRIVE_CLIENT_ID + config[TEST_SECTION][ + 
Config.SECTION_GDRIVE_CLIENT_SECRET + ] = TEST_GDRIVE_CLIENT_SECRET + self.dvc.config.config = config + self.cloud = DataCloud(self.dvc) + + self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class()) def _get_url(self): return get_gdrive_url() @@ -663,8 +679,17 @@ def _test(self): "remote", "modify", TEST_REMOTE, - Config.SECTION_REMOTE_KEY_FILE, - TEST_GDRIVE_GOOGLE_AUTH_SETTINGS_PATH, + Config.SECTION_GDRIVE_CLIENT_ID, + TEST_GDRIVE_CLIENT_ID, + ] + ) + self.main( + [ + "remote", + "modify", + TEST_REMOTE, + Config.SECTION_GDRIVE_CLIENT_SECRET, + TEST_GDRIVE_CLIENT_SECRET, ] ) From 901adbf5c8e8b066a0f4698cf3161617045d4b7b Mon Sep 17 00:00:00 2001 From: Max Risuhin Date: Mon, 11 Nov 2019 07:09:45 -0800 Subject: [PATCH 31/33] Move credentials file erasing to earlier stage just after auth is completed --- dvc/remote/gdrive/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 8828e611d9..567c36158e 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -131,11 +131,11 @@ def drive(self): # Pass non existent settings path to force DEFAULT_SETTINGS loading gauth = GoogleAuth(settings_file="") gauth.CommandLineAuth() - gdrive = GoogleDrive(gauth) if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): os.remove(self.gdrive_user_credentials_path) + gdrive = GoogleDrive(gauth) return gdrive def create_drive_item(self, parent_id, title): From db8e7eddaea93a8fef342ebf631615449f9a12e4 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Mon, 11 Nov 2019 15:10:19 +0000 Subject: [PATCH 32/33] Restyled by reorder-python-imports --- dvc/config.py | 22 ++++++---- dvc/remote/__init__.py | 9 ++-- dvc/remote/gdrive/__init__.py | 23 +++++------ dvc/remote/gdrive/pydrive.py | 3 +- setup.py | 8 ++-- tests/func/test_data_cloud.py | 55 +++++++++++++------------ tests/unit/remote/gdrive/test_gdrive.py | 1 + 7 files changed, 65 insertions(+), 56 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 7f3d747817..f3621d907b 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -1,18 +1,24 @@ """DVC config objects.""" - from __future__ import unicode_literals -from dvc.utils.compat import str, open - -import os -import re import copy import errno -import configobj import logging +import os +import re -from schema import Schema, Optional, And, Use, Regex, SchemaError -from dvc.exceptions import DvcException, NotDvcRepoError +import configobj +from schema import And +from schema import Optional +from schema import Regex +from schema import Schema +from schema import SchemaError +from schema import Use + +from dvc.exceptions import DvcException +from dvc.exceptions import NotDvcRepoError +from dvc.utils.compat import open +from dvc.utils.compat import str logger = logging.getLogger(__name__) diff --git a/dvc/remote/__init__.py b/dvc/remote/__init__.py index f14ed9f5d4..e2c20a2168 100644 --- a/dvc/remote/__init__.py +++ b/dvc/remote/__init__.py @@ -1,17 +1,16 @@ from __future__ import unicode_literals +from .config import RemoteConfig from dvc.remote.azure import RemoteAZURE from dvc.remote.gdrive import RemoteGDrive from dvc.remote.gs import RemoteGS from dvc.remote.hdfs import RemoteHDFS -from dvc.remote.local import RemoteLOCAL -from dvc.remote.s3 import RemoteS3 -from dvc.remote.ssh import RemoteSSH from dvc.remote.http import RemoteHTTP from dvc.remote.https import RemoteHTTPS +from dvc.remote.local import RemoteLOCAL from dvc.remote.oss import RemoteOSS - -from .config import 
RemoteConfig +from dvc.remote.s3 import RemoteS3 +from dvc.remote.ssh import RemoteSSH REMOTES = [ diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index 567c36158e..ae93969607 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -1,25 +1,24 @@ from __future__ import unicode_literals +import logging import os import posixpath -import logging +from backoff import expo +from backoff import on_exception from funcy import cached_property -from backoff import on_exception, expo -from dvc.scheme import Schemes -from dvc.path_info import CloudURLInfo -from dvc.remote.base import RemoteBASE from dvc.config import Config from dvc.exceptions import DvcException -from dvc.remote.gdrive.pydrive import ( - RequestListFile, - RequestListFilePaginated, - RequestCreateFolder, - RequestUploadFile, - RequestDownloadFile, -) +from dvc.path_info import CloudURLInfo +from dvc.remote.base import RemoteBASE +from dvc.remote.gdrive.pydrive import RequestCreateFolder +from dvc.remote.gdrive.pydrive import RequestDownloadFile +from dvc.remote.gdrive.pydrive import RequestListFile +from dvc.remote.gdrive.pydrive import RequestListFilePaginated +from dvc.remote.gdrive.pydrive import RequestUploadFile from dvc.remote.gdrive.utils import FOLDER_MIME_TYPE +from dvc.scheme import Schemes logger = logging.getLogger(__name__) diff --git a/dvc/remote/gdrive/pydrive.py b/dvc/remote/gdrive/pydrive.py index 46836d04fc..3e54179ea1 100644 --- a/dvc/remote/gdrive/pydrive.py +++ b/dvc/remote/gdrive/pydrive.py @@ -1,6 +1,7 @@ import os -from dvc.remote.gdrive.utils import TrackFileReadProgress, FOLDER_MIME_TYPE +from dvc.remote.gdrive.utils import FOLDER_MIME_TYPE +from dvc.remote.gdrive.utils import TrackFileReadProgress class RequestBASE: diff --git a/setup.py b/setup.py index 31ba4deb23..c6f5b39f2e 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,14 @@ -from setuptools import setup, find_packages -from setuptools.command.build_py import build_py as _build_py import os import sys +from setuptools import find_packages +from setuptools import setup +from setuptools.command.build_py import build_py as _build_py + +import fastentrypoints # noqa: F401 # Prevents pkg_resources import in entry point script, # see https://github.com/ninjaaron/fast-entry_points. # This saves about 200 ms on startup time for non-wheel installs. 
-import fastentrypoints # noqa: F401 # https://packaging.python.org/guides/single-sourcing-package-version/ diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 9b880ca61d..ad0d33020b 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -1,38 +1,39 @@ -from subprocess import CalledProcessError -from subprocess import check_output, Popen -from unittest import SkipTest -import os -import uuid -import shutil -import getpass -import platform import copy +import getpass import logging -import pytest +import os +import platform +import shutil +import uuid +from subprocess import CalledProcessError +from subprocess import check_output +from subprocess import Popen +from unittest import SkipTest +import pytest from mock import patch -from dvc.utils.compat import str -from dvc.utils import env2bool -from dvc.main import main -from dvc.config import Config from dvc.cache import NamedCache +from dvc.config import Config from dvc.data_cloud import DataCloud -from dvc.remote import ( - RemoteS3, - RemoteGS, - RemoteGDrive, - RemoteAZURE, - RemoteOSS, - RemoteLOCAL, - RemoteSSH, - RemoteHDFS, - RemoteHTTP, -) -from dvc.remote.base import STATUS_OK, STATUS_NEW, STATUS_DELETED +from dvc.main import main +from dvc.remote import RemoteAZURE +from dvc.remote import RemoteGDrive +from dvc.remote import RemoteGS +from dvc.remote import RemoteHDFS +from dvc.remote import RemoteHTTP +from dvc.remote import RemoteLOCAL +from dvc.remote import RemoteOSS +from dvc.remote import RemoteS3 +from dvc.remote import RemoteSSH +from dvc.remote.base import STATUS_DELETED +from dvc.remote.base import STATUS_NEW +from dvc.remote.base import STATUS_OK +from dvc.utils import env2bool from dvc.utils import file_md5 -from dvc.utils.stage import load_stage_file, dump_stage_file - +from dvc.utils.compat import str +from dvc.utils.stage import dump_stage_file +from dvc.utils.stage import load_stage_file from tests.basic_env import TestDvc from tests.utils import spy diff --git a/tests/unit/remote/gdrive/test_gdrive.py b/tests/unit/remote/gdrive/test_gdrive.py index 28e003748c..012adf12c9 100644 --- a/tests/unit/remote/gdrive/test_gdrive.py +++ b/tests/unit/remote/gdrive/test_gdrive.py @@ -1,4 +1,5 @@ import mock + from dvc.remote.gdrive import RemoteGDrive From 1b4149cbac977aff6076d17e72d318d9eaec1e2b Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Mon, 11 Nov 2019 15:10:21 +0000 Subject: [PATCH 33/33] Restyled by yapf --- dvc/config.py | 82 ++++++++------------ dvc/remote/__init__.py | 1 - dvc/remote/gdrive/__init__.py | 78 ++++++++----------- dvc/remote/gdrive/pydrive.py | 54 +++++++------ dvc/remote/gdrive/utils.py | 1 - setup.py | 1 - tests/func/test_data_cloud.py | 141 ++++++++++++++-------------------- 7 files changed, 157 insertions(+), 201 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index f3621d907b..3d500b685d 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -32,19 +32,16 @@ class ConfigError(DvcException): """ def __init__(self, msg, cause=None): - super(ConfigError, self).__init__( - "config file error: {}".format(msg), cause=cause - ) + super(ConfigError, self).__init__("config file error: {}".format(msg), + cause=cause) class NoRemoteError(ConfigError): def __init__(self, command, cause=None): - msg = ( - "no remote specified. Setup default remote with\n" - " dvc config core.remote \n" - "or use:\n" - " dvc {} -r \n".format(command) - ) + msg = ("no remote specified. 
Setup default remote with\n" + " dvc config core.remote \n" + "or use:\n" + " dvc {} -r \n".format(command)) super(NoRemoteError, self).__init__(msg, cause=cause) @@ -165,8 +162,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CORE = "core" SECTION_CORE_LOGLEVEL = "loglevel" SECTION_CORE_LOGLEVEL_SCHEMA = And( - Use(str.lower), Choices("info", "debug", "warning", "error") - ) + Use(str.lower), Choices("info", "debug", "warning", "error")) SECTION_CORE_REMOTE = "remote" SECTION_CORE_INTERACTIVE_SCHEMA = BOOL_SCHEMA SECTION_CORE_INTERACTIVE = "interactive" @@ -205,19 +201,16 @@ class Config(object): # pylint: disable=too-many-instance-attributes } SECTION_CORE_SCHEMA = { - Optional(SECTION_CORE_LOGLEVEL): And( - str, Use(str.lower), SECTION_CORE_LOGLEVEL_SCHEMA - ), - Optional(SECTION_CORE_REMOTE, default=""): And(str, Use(str.lower)), - Optional( - SECTION_CORE_INTERACTIVE, default=False - ): SECTION_CORE_INTERACTIVE_SCHEMA, - Optional( - SECTION_CORE_ANALYTICS, default=True - ): SECTION_CORE_ANALYTICS_SCHEMA, - Optional( - SECTION_CORE_CHECKSUM_JOBS, default=None - ): SECTION_CORE_CHECKSUM_JOBS_SCHEMA, + Optional(SECTION_CORE_LOGLEVEL): + And(str, Use(str.lower), SECTION_CORE_LOGLEVEL_SCHEMA), + Optional(SECTION_CORE_REMOTE, default=""): + And(str, Use(str.lower)), + Optional(SECTION_CORE_INTERACTIVE, default=False): + SECTION_CORE_INTERACTIVE_SCHEMA, + Optional(SECTION_CORE_ANALYTICS, default=True): + SECTION_CORE_ANALYTICS_SCHEMA, + Optional(SECTION_CORE_CHECKSUM_JOBS, default=None): + SECTION_CORE_CHECKSUM_JOBS_SCHEMA, } # backward compatibility @@ -354,9 +347,8 @@ def get_global_config_dir(): """ from appdirs import user_config_dir - return user_config_dir( - appname=Config.APPNAME, appauthor=Config.APPAUTHOR - ) + return user_config_dir(appname=Config.APPNAME, + appauthor=Config.APPAUTHOR) @staticmethod def get_system_config_dir(): @@ -367,9 +359,8 @@ def get_system_config_dir(): """ from appdirs import site_config_dir - return site_config_dir( - appname=Config.APPNAME, appauthor=Config.APPAUTHOR - ) + return site_config_dir(appname=Config.APPNAME, + appauthor=Config.APPAUTHOR) @staticmethod def init(dvc_dir): @@ -412,13 +403,11 @@ def _resolve_paths(self, config): return ret def _load_configs(self): - system_config_file = os.path.join( - self.get_system_config_dir(), self.CONFIG - ) + system_config_file = os.path.join(self.get_system_config_dir(), + self.CONFIG) - global_config_file = os.path.join( - self.get_global_config_dir(), self.CONFIG - ) + global_config_file = os.path.join(self.get_global_config_dir(), + self.CONFIG) self._system_config = configobj.ConfigObj(system_config_file) self._global_config = configobj.ConfigObj(global_config_file) @@ -452,10 +441,10 @@ def load(self): self.config = configobj.ConfigObj() for c in [ - self._system_config, - self._global_config, - self._repo_config, - self._local_config, + self._system_config, + self._global_config, + self._repo_config, + self._local_config, ]: c = self._resolve_paths(c) c = self._lower(c) @@ -531,9 +520,8 @@ def unset(self, section, opt=None, level=None, force=False): if opt not in config[section].keys(): if force: return - raise ConfigError( - "option '{}.{}' doesn't exist".format(section, opt) - ) + raise ConfigError("option '{}.{}' doesn't exist".format( + section, opt)) del config[section][opt] if not config[section]: @@ -566,8 +554,7 @@ def set(self, section, opt, value, level=None, force=True): elif not force: raise ConfigError( "Section '{}' already exists. 
Use `-f|--force` to overwrite " - "section with new value.".format(section) - ) + "section with new value.".format(section)) config[section][opt] = value self.save(config) @@ -589,9 +576,8 @@ def get(self, section, opt=None, level=None): raise ConfigError("section '{}' doesn't exist".format(section)) if opt not in config[section].keys(): - raise ConfigError( - "option '{}.{}' doesn't exist".format(section, opt) - ) + raise ConfigError("option '{}.{}' doesn't exist".format( + section, opt)) return config[section][opt] diff --git a/dvc/remote/__init__.py b/dvc/remote/__init__.py index e2c20a2168..c70b09ec69 100644 --- a/dvc/remote/__init__.py +++ b/dvc/remote/__init__.py @@ -12,7 +12,6 @@ from dvc.remote.s3 import RemoteS3 from dvc.remote.ssh import RemoteSSH - REMOTES = [ RemoteAZURE, RemoteGDrive, diff --git a/dvc/remote/gdrive/__init__.py b/dvc/remote/gdrive/__init__.py index ae93969607..95ef42999c 100644 --- a/dvc/remote/gdrive/__init__.py +++ b/dvc/remote/gdrive/__init__.py @@ -46,17 +46,13 @@ def __init__(self, repo, config): def init_drive(self): self.gdrive_client_id = self.config.get( - Config.SECTION_GDRIVE_CLIENT_ID, None - ) + Config.SECTION_GDRIVE_CLIENT_ID, None) self.gdrive_client_secret = self.config.get( - Config.SECTION_GDRIVE_CLIENT_SECRET, None - ) + Config.SECTION_GDRIVE_CLIENT_SECRET, None) if not self.gdrive_client_id or not self.gdrive_client_secret: - raise DvcException( - "Please specify Google Drive's client id and " - "secret in DVC's config. Learn more at " - "https://man.dvc.org/remote/add." - ) + raise DvcException("Please specify Google Drive's client id and " + "secret in DVC's config. Learn more at " + "https://man.dvc.org/remote/add.") self.gdrive_user_credentials_path = self.config.get( Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE, self.DEFAULT_USER_CREDENTIALS_FILE, @@ -88,8 +84,7 @@ def cache_root_dirs(self): cached_dirs = {} cached_ids = {} for dir1 in self.list_drive_item( - "'{}' in parents and trashed=false".format(self.root_id) - ): + "'{}' in parents and trashed=false".format(self.root_id)): cached_dirs.setdefault(dir1["title"], []).append(dir1["id"]) cached_ids[dir1["id"]] = dir1["title"] return cached_dirs, cached_ids @@ -100,12 +95,10 @@ def drive(self): from pydrive.drive import GoogleDrive if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA): - with open( - self.gdrive_user_credentials_path, "w" - ) as credentials_file: + with open(self.gdrive_user_credentials_path, + "w") as credentials_file: credentials_file.write( - os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA) - ) + os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA)) GoogleAuth.DEFAULT_SETTINGS["client_config_backend"] = "settings" GoogleAuth.DEFAULT_SETTINGS["client_config"] = { @@ -119,8 +112,7 @@ def drive(self): GoogleAuth.DEFAULT_SETTINGS["save_credentials"] = True GoogleAuth.DEFAULT_SETTINGS["save_credentials_backend"] = "file" GoogleAuth.DEFAULT_SETTINGS[ - "save_credentials_file" - ] = self.gdrive_user_credentials_path + "save_credentials_file"] = self.gdrive_user_credentials_path GoogleAuth.DEFAULT_SETTINGS["get_refresh_token"] = True GoogleAuth.DEFAULT_SETTINGS["oauth_scope"] = [ "https://www.googleapis.com/auth/drive", @@ -138,18 +130,19 @@ def drive(self): return gdrive def create_drive_item(self, parent_id, title): - upload_request = RequestCreateFolder( - {"drive": self.drive, "title": title, "parent_id": parent_id} - ) + upload_request = RequestCreateFolder({ + "drive": self.drive, + "title": title, + "parent_id": parent_id + }) result = 
self.execute_request(upload_request) return result def get_drive_item(self, name, parents_ids): if not parents_ids: return None - query = " or ".join( - "'{}' in parents".format(parent_id) for parent_id in parents_ids - ) + query = " or ".join("'{}' in parents".format(parent_id) + for parent_id in parents_ids) query += " and trashed=false and title='{}'".format(name) @@ -181,12 +174,8 @@ def subtract_root_path(self, parts): def get_path_id_from_cache(self, path_info): files_ids = [] parts, parents_ids = self.subtract_root_path(path_info.path.split("/")) - if ( - hasattr(self, "cached_dirs") - and path_info != self.path_info - and parts - and (parts[0] in self.cached_dirs) - ): + if (hasattr(self, "cached_dirs") and path_info != self.path_info + and parts and (parts[0] in self.cached_dirs)): parents_ids = self.cached_dirs[parts[0]] files_ids = self.cached_dirs[parts[0]] parts.pop(0) @@ -226,15 +215,18 @@ def _upload(self, from_file, to_info, name, no_progress_bar): def _download(self, from_info, to_file, name, no_progress_bar): file_id = self.get_path_id(from_info) - download_request = RequestDownloadFile( - { - "drive": self.drive, - "file_id": file_id, - "to_file": to_file, - "progress_name": name, - "no_progress_bar": no_progress_bar, - } - ) + download_request = RequestDownloadFile({ + "drive": + self.drive, + "file_id": + file_id, + "to_file": + to_file, + "progress_name": + name, + "no_progress_bar": + no_progress_bar, + }) self.execute_request(download_request) def list_cache_paths(self): @@ -252,8 +244,7 @@ def list_file_path(self, drive_file): def list_path(self, parent_id): for file1 in self.list_drive_item( - "'{}' in parents and trashed=false".format(parent_id) - ): + "'{}' in parents and trashed=false".format(parent_id)): for path in self.list_file_path(file1): yield path @@ -261,9 +252,8 @@ def all(self): if not hasattr(self, "cached_ids") or not self.cached_ids: return - query = " or ".join( - "'{}' in parents".format(dir_id) for dir_id in self.cached_ids - ) + query = " or ".join("'{}' in parents".format(dir_id) + for dir_id in self.cached_ids) query += " and trashed=false" for file1 in self.list_drive_item(query): diff --git a/dvc/remote/gdrive/pydrive.py b/dvc/remote/gdrive/pydrive.py index 3e54179ea1..5c8595c56e 100644 --- a/dvc/remote/gdrive/pydrive.py +++ b/dvc/remote/gdrive/pydrive.py @@ -18,9 +18,10 @@ def __init__(self, drive, query): self.query = query def execute(self): - return self.drive.ListFile( - {"q": self.query, "maxResults": 1000} - ).GetList() + return self.drive.ListFile({ + "q": self.query, + "maxResults": 1000 + }).GetList() class RequestListFilePaginated(RequestBASE): @@ -32,8 +33,10 @@ def __init__(self, drive, query): def execute(self): if not self.iter: self.iter = iter( - self.drive.ListFile({"q": self.query, "maxResults": 1000}) - ) + self.drive.ListFile({ + "q": self.query, + "maxResults": 1000 + })) return next(self.iter, None) @@ -44,21 +47,23 @@ def __init__(self, args): self.parent_id = args["parent_id"] def execute(self): - item = self.drive.CreateFile( - { - "title": self.title, - "parents": [{"id": self.parent_id}], - "mimeType": FOLDER_MIME_TYPE, - } - ) + item = self.drive.CreateFile({ + "title": self.title, + "parents": [{ + "id": self.parent_id + }], + "mimeType": FOLDER_MIME_TYPE, + }) item.Upload() return item class RequestUploadFile(RequestBASE): - def __init__( - self, args, no_progress_bar=True, from_file="", progress_name="" - ): + def __init__(self, + args, + no_progress_bar=True, + from_file="", + progress_name=""): 
super(RequestUploadFile, self).__init__(args["drive"]) self.title = args["title"] self.parent_id = args["parent_id"] @@ -69,17 +74,19 @@ def __init__( def upload(self, item): with open(self.from_file, "rb") as from_file: if not self.no_progress_bar: - from_file = TrackFileReadProgress( - self.progress_name, from_file - ) + from_file = TrackFileReadProgress(self.progress_name, + from_file) if os.stat(self.from_file).st_size: item.content = from_file item.Upload() def execute(self): - item = self.drive.CreateFile( - {"title": self.title, "parents": [{"id": self.parent_id}]} - ) + item = self.drive.CreateFile({ + "title": self.title, + "parents": [{ + "id": self.parent_id + }] + }) self.upload(item) return item @@ -97,9 +104,8 @@ def execute(self): gdrive_file = self.drive.CreateFile({"id": self.file_id}) if not self.no_progress_bar: - tqdm = Tqdm( - desc=self.progress_name, total=int(gdrive_file["fileSize"]) - ) + tqdm = Tqdm(desc=self.progress_name, + total=int(gdrive_file["fileSize"])) gdrive_file.GetContentFile(self.to_file) if not self.no_progress_bar: tqdm.close() diff --git a/dvc/remote/gdrive/utils.py b/dvc/remote/gdrive/utils.py index 0f3cf02cd0..e067e2b737 100644 --- a/dvc/remote/gdrive/utils.py +++ b/dvc/remote/gdrive/utils.py @@ -2,7 +2,6 @@ from dvc.progress import Tqdm - FOLDER_MIME_TYPE = "application/vnd.google-apps.folder" diff --git a/setup.py b/setup.py index c6f5b39f2e..cf772a1786 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,6 @@ # see https://github.com/ninjaaron/fast-entry_points. # This saves about 200 ms on startup time for non-wheel installs. - # https://packaging.python.org/guides/single-sourcing-package-version/ pkg_dir = os.path.dirname(os.path.abspath(__file__)) version_path = os.path.join(pkg_dir, "dvc", "version.py") diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index ad0d33020b..23a3e8f281 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -37,13 +37,16 @@ from tests.basic_env import TestDvc from tests.utils import spy - TEST_REMOTE = "upstream" TEST_SECTION = 'remote "{}"'.format(TEST_REMOTE) TEST_CONFIG = { Config.SECTION_CACHE: {}, - Config.SECTION_CORE: {Config.SECTION_CORE_REMOTE: TEST_REMOTE}, - TEST_SECTION: {Config.SECTION_REMOTE_URL: ""}, + Config.SECTION_CORE: { + Config.SECTION_CORE_REMOTE: TEST_REMOTE + }, + TEST_SECTION: { + Config.SECTION_REMOTE_URL: "" + }, } TEST_AWS_REPO_BUCKET = os.environ.get("DVC_TEST_AWS_REPO_BUCKET", "dvc-test") @@ -54,14 +57,12 @@ os.environ.get( "GOOGLE_APPLICATION_CREDENTIALS", os.path.join("scripts", "ci", "gcp-creds.json"), - ) -) + )) # Ensure that absolute path is used os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = TEST_GCP_CREDS_FILE TEST_GDRIVE_CLIENT_ID = ( - "719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com" -) + "719861249063-v4an78j9grdtuuuqg3lnm0sugna6v3lh.apps.googleusercontent.com") TEST_GDRIVE_CLIENT_SECRET = "2fy_HyzSwkxkGzEken7hThXb" @@ -92,15 +93,13 @@ def _should_test_gcp(): return False try: - check_output( - [ - "gcloud", - "auth", - "activate-service-account", - "--key-file", - TEST_GCP_CREDS_FILE, - ] - ) + check_output([ + "gcloud", + "auth", + "activate-service-account", + "--key-file", + TEST_GCP_CREDS_FILE, + ]) except (CalledProcessError, OSError): return False return True @@ -112,8 +111,7 @@ def _should_test_azure(): return do_test return os.getenv("AZURE_STORAGE_CONTAINER_NAME") and os.getenv( - "AZURE_STORAGE_CONNECTION_STRING" - ) + "AZURE_STORAGE_CONNECTION_STRING") def _should_test_oss(): @@ -121,11 
+119,8 @@ def _should_test_oss(): if do_test is not None: return do_test - return ( - os.getenv("OSS_ENDPOINT") - and os.getenv("OSS_ACCESS_KEY_ID") - and os.getenv("OSS_ACCESS_KEY_SECRET") - ) + return (os.getenv("OSS_ENDPOINT") and os.getenv("OSS_ACCESS_KEY_ID") + and os.getenv("OSS_ACCESS_KEY_SECRET")) def _should_test_ssh(): @@ -150,9 +145,9 @@ def _should_test_hdfs(): return False try: - check_output( - ["hadoop", "version"], shell=True, executable=os.getenv("SHELL") - ) + check_output(["hadoop", "version"], + shell=True, + executable=os.getenv("SHELL")) except (CalledProcessError, IOError): return False @@ -177,9 +172,8 @@ def get_local_url(): def get_ssh_url(): - return "ssh://{}@127.0.0.1:22{}".format( - getpass.getuser(), get_local_storagepath() - ) + return "ssh://{}@127.0.0.1:22{}".format(getpass.getuser(), + get_local_storagepath()) def get_ssh_url_mocked(user, port): @@ -202,9 +196,8 @@ def get_ssh_url_mocked(user, port): def get_hdfs_url(): - return "hdfs://{}@127.0.0.1{}".format( - getpass.getuser(), get_local_storagepath() - ) + return "hdfs://{}@127.0.0.1{}".format(getpass.getuser(), + get_local_storagepath()) def get_aws_storagepath(): @@ -281,9 +274,8 @@ def _get_keyfile(self): def _ensure_should_run(self): if not self._should_test(): - raise SkipTest( - "Test {} is disabled".format(self.__class__.__name__) - ) + raise SkipTest("Test {} is disabled".format( + self.__class__.__name__)) def _setup_cloud(self): self._ensure_should_run() @@ -404,11 +396,9 @@ def _setup_cloud(self): config = copy.deepcopy(TEST_CONFIG) config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo config[TEST_SECTION][ - Config.SECTION_GDRIVE_CLIENT_ID - ] = TEST_GDRIVE_CLIENT_ID + Config.SECTION_GDRIVE_CLIENT_ID] = TEST_GDRIVE_CLIENT_ID config[TEST_SECTION][ - Config.SECTION_GDRIVE_CLIENT_SECRET - ] = TEST_GDRIVE_CLIENT_SECRET + Config.SECTION_GDRIVE_CLIENT_SECRET] = TEST_GDRIVE_CLIENT_SECRET self.dvc.config.config = config self.cloud = DataCloud(self.dvc) @@ -433,8 +423,7 @@ def _setup_cloud(self): config = copy.deepcopy(TEST_CONFIG) config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo config[TEST_SECTION][ - Config.SECTION_GCP_CREDENTIALPATH - ] = TEST_GCP_CREDS_FILE + Config.SECTION_GCP_CREDENTIALPATH] = TEST_GCP_CREDS_FILE self.dvc.config.config = config self.cloud = DataCloud(self.dvc) @@ -616,9 +605,8 @@ def _test(self): def test(self): if not self._should_test(): - raise SkipTest( - "Test {} is disabled".format(self.__class__.__name__) - ) + raise SkipTest("Test {} is disabled".format( + self.__class__.__name__)) self._test() @@ -675,24 +663,20 @@ def _test(self): url = get_gdrive_url() self.main(["remote", "add", TEST_REMOTE, url]) - self.main( - [ - "remote", - "modify", - TEST_REMOTE, - Config.SECTION_GDRIVE_CLIENT_ID, - TEST_GDRIVE_CLIENT_ID, - ] - ) - self.main( - [ - "remote", - "modify", - TEST_REMOTE, - Config.SECTION_GDRIVE_CLIENT_SECRET, - TEST_GDRIVE_CLIENT_SECRET, - ] - ) + self.main([ + "remote", + "modify", + TEST_REMOTE, + Config.SECTION_GDRIVE_CLIENT_ID, + TEST_GDRIVE_CLIENT_ID, + ]) + self.main([ + "remote", + "modify", + TEST_REMOTE, + Config.SECTION_GDRIVE_CLIENT_SECRET, + TEST_GDRIVE_CLIENT_SECRET, + ]) self._test_cloud(TEST_REMOTE) @@ -705,15 +689,13 @@ def _test(self): url = get_gcp_url() self.main(["remote", "add", TEST_REMOTE, url]) - self.main( - [ - "remote", - "modify", - TEST_REMOTE, - "credentialpath", - TEST_GCP_CREDS_FILE, - ] - ) + self.main([ + "remote", + "modify", + TEST_REMOTE, + "credentialpath", + TEST_GCP_CREDS_FILE, + ]) self._test_cloud(TEST_REMOTE) 
@@ -778,8 +760,7 @@ def _test(self): expected_warning = ( "Output 'bar'(Stage: 'bar.dvc') is missing version info." " Cache for it will not be collected." - " Use dvc repro to get your pipeline up to date." - ) + " Use dvc repro to get your pipeline up to date.") assert expected_warning in self._caplog.text @@ -872,9 +853,8 @@ def test(self): class TestCheckSumRecalculation(TestDvc): def test(self): test_get_file_checksum = spy(RemoteLOCAL.get_file_checksum) - with patch.object( - RemoteLOCAL, "get_file_checksum", test_get_file_checksum - ): + with patch.object(RemoteLOCAL, "get_file_checksum", + test_get_file_checksum): url = get_local_url() ret = main(["remote", "add", "-d", TEST_REMOTE, url]) self.assertEqual(ret, 0) @@ -910,14 +890,11 @@ def setUp(self): checksum_bar = file_md5(self.BAR)[0] self.message_header = ( "Some of the cache files do not exist neither locally " - "nor on remote. Missing cache files: " - ) + "nor on remote. Missing cache files: ") self.message_bar_part = "name: {}, md5: {}".format( - self.BAR, checksum_bar - ) + self.BAR, checksum_bar) self.message_foo_part = "name: {}, md5: {}".format( - self.FOO, checksum_foo - ) + self.FOO, checksum_foo) def test(self): self._caplog.clear()
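For context, here is a minimal usage sketch of the configuration flow these patches introduce, mirroring how the TestRemoteGDrive / TestRemoteGDriveCLI tests above drive it. It assumes a DVC repository in the current working directory, the "gdrive" extra installed, and placeholder OAuth values (the client id and secret below are hypothetical, not the test credentials):

    from dvc.main import main

    # Register a Google Drive remote; "root" addresses the Drive root folder,
    # matching the gdrive://root/... URLs used by the tests.
    main(["remote", "add", "-d", "myremote", "gdrive://root/dvc-storage"])

    # Supply the OAuth client pair via the config options added in patch 30.
    main(["remote", "modify", "myremote", "gdrive_client_id", "<your-client-id>"])
    main(["remote", "modify", "myremote", "gdrive_client_secret", "<your-client-secret>"])

When the remote is first used (for example by dvc push), RemoteGDrive.drive runs pydrive's CommandLineAuth() and caches the obtained token at .dvc/tmp/gdrive-user-credentials.json unless gdrive_user_credentials_file points elsewhere; CI can pre-seed that token by exporting it in the GDRIVE_USER_CREDENTIALS_DATA environment variable, as the functional tests do.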