-
Notifications
You must be signed in to change notification settings - Fork 1.3k
remote: implement Google Drive #2040
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| include dvc/remote/gdrive/google-dvc-client-id.json |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,202 @@ | ||
| from __future__ import unicode_literals | ||
|
|
||
| import os | ||
| import logging | ||
|
|
||
| try: | ||
| import google_auth_oauthlib | ||
| from dvc.remote.gdrive.client import GDriveClient | ||
| except ImportError: | ||
| google_auth_oauthlib = None | ||
|
|
||
| from dvc.scheme import Schemes | ||
| from dvc.path_info import CloudURLInfo | ||
| from dvc.remote.base import RemoteBASE | ||
| from dvc.config import Config | ||
| from dvc.remote.gdrive.utils import ( | ||
| TrackFileReadProgress, | ||
| only_once, | ||
| metadata_isdir, | ||
| shared_token_warning, | ||
| ) | ||
| from dvc.remote.gdrive.exceptions import GDriveError, GDriveResourceNotFound | ||
|
|
||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class GDriveURLInfo(CloudURLInfo): | ||
| @property | ||
| def netloc(self): | ||
| return self.parsed.netloc | ||
|
|
||
|
|
||
| class RemoteGDrive(RemoteBASE): | ||
| """Google Drive remote implementation | ||
| ## Some notes on Google Drive design | ||
| Google Drive differs from S3 and GS remotes - it identifies the resources | ||
| by IDs instead of paths. | ||
| Folders are regular resources with an `application/vnd.google-apps.folder` | ||
| MIME type. Resource can have multiple parent folders, and also there could | ||
| be multiple resources with the same name linked to a single folder, so | ||
| files could be duplicated. | ||
| There are multiple root folders accessible from a single user account: | ||
| - `root` (special ID) - alias for the "My Drive" folder | ||
| - `appDataFolder` (special ID) - alias for the hidden application | ||
| space root folder | ||
| - shared drives root folders | ||
| ## Example URLs | ||
| - Datasets/my-dataset inside "My Drive" folder: | ||
| gdrive://root/Datasets/my-dataset | ||
| - Folder by ID (recommended): | ||
| gdrive://1r3UbnmS5B4-7YZPZmyqJuCxLVps1mASC | ||
| (get it https://drive.google.com/drive/folders/{here}) | ||
| - Dataset named "my-dataset" in the hidden application folder: | ||
| gdrive://appDataFolder/my-dataset | ||
| (this one wouldn't be visible through Google Drive web UI and | ||
ei-grad marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| couldn't be shared) | ||
| """ | ||
|
|
||
| scheme = Schemes.GDRIVE | ||
| path_cls = GDriveURLInfo | ||
| REGEX = r"^gdrive://.*$" | ||
| REQUIRES = {"google-auth-oauthlib": google_auth_oauthlib} | ||
| PARAM_CHECKSUM = "md5Checksum" | ||
| SPACE_DRIVE = "drive" | ||
| SCOPE_DRIVE = "https://www.googleapis.com/auth/drive" | ||
| SPACE_APPDATA = "appDataFolder" | ||
| SCOPE_APPDATA = "https://www.googleapis.com/auth/drive.appdata" | ||
| DEFAULT_OAUTH_ID = "default" | ||
|
|
||
| # Default credential is needed to show the string of "Data Version | ||
| # Control" in OAuth dialog application name and icon in authorized | ||
| # applications list in Google account security settings. Also, the | ||
| # quota usage is limited by the application defined by client_id. | ||
| # The good practice would be to suggest the user to create their | ||
| # own application credentials. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what are the default limits? how hard would it be to increase it? |
||
| DEFAULT_CREDENTIALPATH = os.path.join( | ||
| os.path.dirname(__file__), "google-dvc-client-id.json" | ||
| ) | ||
|
|
||
| def __init__(self, repo, config): | ||
| super(RemoteGDrive, self).__init__(repo, config) | ||
ei-grad marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL]) | ||
| self.root = self.path_info.netloc.lower() | ||
| if self.root == self.SPACE_APPDATA.lower(): | ||
| default_scopes = self.SCOPE_APPDATA | ||
| space = self.SPACE_APPDATA | ||
| else: | ||
| default_scopes = self.SCOPE_DRIVE | ||
| space = self.SPACE_DRIVE | ||
| if Config.SECTION_GDRIVE_CREDENTIALPATH not in config: | ||
| shared_token_warning() | ||
| credentialpath = config.get( | ||
| Config.SECTION_GDRIVE_CREDENTIALPATH, | ||
| self.DEFAULT_CREDENTIALPATH, | ||
| ) | ||
| scopes = config.get(Config.SECTION_GDRIVE_SCOPES, default_scopes) | ||
| # scopes should be a list and it is space-delimited in all | ||
| # configs, and `.split()` also works for a single-element list | ||
| scopes = scopes.split() | ||
|
|
||
| core_config = self.repo.config.config[Config.SECTION_CORE] | ||
| oauth2_flow_runner = core_config.get( | ||
| Config.SECTION_CORE_OAUTH2_FLOW_RUNNER, "console" | ||
| ) | ||
|
|
||
| self.client = GDriveClient( | ||
| space, | ||
| config.get(Config.SECTION_GDRIVE_OAUTH_ID, self.DEFAULT_OAUTH_ID), | ||
| credentialpath, | ||
| scopes, | ||
| oauth2_flow_runner, | ||
| ) | ||
|
|
||
| def get_file_checksum(self, path_info): | ||
| metadata = self.client.get_metadata(path_info, fields=["md5Checksum"]) | ||
| return metadata["md5Checksum"] | ||
|
|
||
| def exists(self, path_info): | ||
| return self.client.exists(path_info) | ||
|
|
||
| def batch_exists(self, path_infos, callback): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| results = [] | ||
| for path_info in path_infos: | ||
| results.append(self.exists(path_info)) | ||
| callback.update(str(path_info)) | ||
| return results | ||
|
|
||
| def list_cache_paths(self): | ||
ei-grad marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| try: | ||
| root = self.client.get_metadata(self.path_info) | ||
| except GDriveResourceNotFound as e: | ||
| logger.debug("list_cache_paths: {}".format(e)) | ||
| else: | ||
| prefix = self.path_info.path | ||
| for i in self.client.list_children(root["id"]): | ||
| yield prefix + "/" + i | ||
|
|
||
| @only_once | ||
| def mkdir(self, parent, name): | ||
| return self.client.mkdir(parent, name) | ||
|
|
||
| def makedirs(self, path_info): | ||
| parent = path_info.netloc | ||
| parts = iter(path_info.path.split("/")) | ||
| current_path = ["gdrive://" + path_info.netloc] | ||
| for part in parts: | ||
| try: | ||
| metadata = self.client.get_metadata( | ||
| self.path_cls.from_parts( | ||
| self.scheme, parent, path="/" + part | ||
| ) | ||
| ) | ||
| except GDriveResourceNotFound: | ||
| break | ||
| else: | ||
| current_path.append(part) | ||
| if not metadata_isdir(metadata): | ||
| raise GDriveError( | ||
| "{} is not a folder".format("/".join(current_path)) | ||
| ) | ||
| parent = metadata["id"] | ||
| to_create = [part] + list(parts) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should it be sublist of parts here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. parts is a partially consumed iterator, but yeah, I'll rewrite it
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The exception/break condition was also valid, but DeepSource suggests too that iterating over iterator with the break/else construct and using the loop variable is something not readable and bug-risky. :) |
||
| for part in to_create: | ||
| parent = self.mkdir(parent, part)["id"] | ||
| return parent | ||
|
|
||
| def _upload(self, from_file, to_info, name, no_progress_bar): | ||
|
|
||
| dirname = to_info.parent.path | ||
| if dirname: | ||
| try: | ||
| parent = self.client.get_metadata(to_info.parent) | ||
| except GDriveResourceNotFound: | ||
| parent = self.makedirs(to_info.parent) | ||
| else: | ||
| parent = to_info.netloc | ||
|
|
||
| from_file = open(from_file, "rb") | ||
| if not no_progress_bar: | ||
| from_file = TrackFileReadProgress(name, from_file) | ||
|
|
||
| try: | ||
| self.client.upload(parent, to_info, from_file) | ||
| finally: | ||
| from_file.close() | ||
|
|
||
| def _download(self, from_info, to_file, name, no_progress_bar): | ||
| self.client.download(from_info, to_file, name, no_progress_bar) | ||
Uh oh!
There was an error while loading. Please reload this page.