From c2b60ee14605e79888083b556a75d500b44dace7 Mon Sep 17 00:00:00 2001 From: nanaya Date: Thu, 2 May 2019 14:59:41 +0800 Subject: [PATCH 01/10] remote: add support for aliyun oss Usage: $ dvc remote add myremote oss://my-bucket.endpoint/path Set key id and key secret using modify command $ dvc remote modify myremote oss_key_id my-key-id $ dvc remote modify myremote oss_key_secret my-key-secret or environment variables $ export OSS_ACCESS_KEY_ID="my-key-id" $ export OSS_ACCESS_KEY_SECRET="my-key-secret" Ref: oss python SDK: https://www.alibabacloud.com/help/doc-detail/32026.htm --- README.rst | 8 +- dvc/cache.py | 1 + dvc/config.py | 7 ++ dvc/data_cloud.py | 2 + dvc/remote/__init__.py | 2 + dvc/remote/oss.py | 229 ++++++++++++++++++++++++++++++++++ dvc/repo/__init__.py | 1 + dvc/repo/gc.py | 3 + requirements.txt | 1 + setup.py | 4 +- tests/func/test_data_cloud.py | 49 ++++++++ tests/unit/remote/test_oss.py | 26 ++++ 12 files changed, 328 insertions(+), 5 deletions(-) create mode 100644 dvc/remote/oss.py create mode 100644 tests/unit/remote/test_oss.py diff --git a/README.rst b/README.rst index 76cbf370b5..fabda7040f 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ .. image:: https://codecov.io/gh/iterative/dvc/branch/master/graph/badge.svg :target: https://codecov.io/gh/iterative/dvc :alt: Codecov - + .. image:: https://img.shields.io/badge/patreon-donate-green.svg :target: https://www.patreon.com/DVCorg/overview :alt: Donate @@ -38,7 +38,7 @@ machine learning projects. Key features: any databases. Does not depend on any proprietary online services; #. it manages and versions **datasets** and **machine learning models**. Data is saved in - S3, Google cloud, Azure, SSH server, HDFS or even local HDD RAID; + S3, Google cloud, Azure, Alibaba cloud, SSH server, HDFS or even local HDD RAID; #. it makes projects **reproducible** and **shareable**, it helps answering question how the model was build; @@ -63,7 +63,7 @@ made right and tailored specifically for ML and Data Science scenarios. #. ``Git/Git-lfs`` part - DVC helps you storing and sharing data artifacts, models. It connects them with your Git repository. -#. ``Makefiles`` part - DVC describes how one data or model artifact was build from another data. +#. ``Makefiles`` part - DVC describes how one data or model artifact was build from another data. DVC usually runs along with Git. Git is used as usual to store and version code and DVC meta-files. DVC helps to store data and model files seamlessly out of Git while preserving almost the same user experience as if they @@ -118,7 +118,7 @@ pip (PyPI) pip install dvc Depending on the remote storage type you plan to use to keep and share your data, you might need to specify -one of the optional dependencies: ``s3``, ``gs``, ``azure``, ``ssh``. Or ``all_remotes`` to include them all. +one of the optional dependencies: ``s3``, ``gs``, ``azure``, ``oss``, ``ssh``. Or ``all_remotes`` to include them all. The command should look like this: ``pip install dvc[s3]`` - it installs the ``boto3`` library along with DVC to support the AWS S3 storage. diff --git a/dvc/cache.py b/dvc/cache.py index e1b9d7b5d2..54bba75dbe 100644 --- a/dvc/cache.py +++ b/dvc/cache.py @@ -45,6 +45,7 @@ def __init__(self, repo): self.local = Remote(repo, settings) self.s3 = self._get_remote(config, Config.SECTION_CACHE_S3) self.gs = self._get_remote(config, Config.SECTION_CACHE_GS) + self.oss = self._get_remote(config, Config.SECTION_CACHE_OSS) self.ssh = self._get_remote(config, Config.SECTION_CACHE_SSH) self.hdfs = self._get_remote(config, Config.SECTION_CACHE_HDFS) self.azure = self._get_remote(config, Config.SECTION_CACHE_AZURE) diff --git a/dvc/config.py b/dvc/config.py index 89fb00497f..6aae5180da 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -156,6 +156,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CACHE_SSH = "ssh" SECTION_CACHE_HDFS = "hdfs" SECTION_CACHE_AZURE = "azure" + SECTION_CACHE_OSS = "oss" SECTION_CACHE_SLOW_LINK_WARNING = "slow_link_warning" SECTION_CACHE_SCHEMA = { Optional(SECTION_CACHE_LOCAL): str, @@ -164,6 +165,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_CACHE_HDFS): str, Optional(SECTION_CACHE_SSH): str, Optional(SECTION_CACHE_AZURE): str, + Optional(SECTION_CACHE_OSS): str, Optional(SECTION_CACHE_DIR): str, Optional(SECTION_CACHE_TYPE, default=None): SECTION_CACHE_TYPE_SCHEMA, Optional(SECTION_CACHE_PROTECTED, default=False): BOOL_SCHEMA, @@ -227,6 +229,9 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_LOCAL_SCHEMA = {SECTION_LOCAL_STORAGEPATH: str} SECTION_AZURE_CONNECTION_STRING = "connection_string" + # Alibabacloud oss options + SECTION_OSS_ACCESS_KEY_ID = "oss_key_id" + SECTION_OSS_ACCESS_KEY_SECRET = "oss_key_secret" SECTION_REMOTE_REGEX = r'^\s*remote\s*"(?P.*)"\s*$' SECTION_REMOTE_FMT = 'remote "{}"' @@ -255,6 +260,8 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_REMOTE_PASSWORD): str, Optional(SECTION_REMOTE_ASK_PASSWORD): BOOL_SCHEMA, Optional(SECTION_AZURE_CONNECTION_STRING): str, + Optional(SECTION_OSS_ACCESS_KEY_ID): str, + Optional(SECTION_OSS_ACCESS_KEY_SECRET): str, Optional(PRIVATE_CWD): str, } diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index f9704336b9..73649f06be 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -9,6 +9,7 @@ from dvc.remote.s3 import RemoteS3 from dvc.remote.gs import RemoteGS from dvc.remote.azure import RemoteAzure +from dvc.remote.oss import RemoteOSS from dvc.remote.ssh import RemoteSSH from dvc.remote.hdfs import RemoteHDFS from dvc.remote.local import RemoteLOCAL @@ -34,6 +35,7 @@ class DataCloud(object): "aws": RemoteS3, "gcp": RemoteGS, "azure": RemoteAzure, + "oss": RemoteOSS, "ssh": RemoteSSH, "hdfs": RemoteHDFS, "local": RemoteLOCAL, diff --git a/dvc/remote/__init__.py b/dvc/remote/__init__.py index 94b274f4a4..5c35c038d8 100644 --- a/dvc/remote/__init__.py +++ b/dvc/remote/__init__.py @@ -7,6 +7,7 @@ from dvc.remote.s3 import RemoteS3 from dvc.remote.ssh import RemoteSSH from dvc.remote.http import RemoteHTTP +from dvc.remote.oss import RemoteOSS REMOTES = [ @@ -16,6 +17,7 @@ RemoteHTTP, RemoteS3, RemoteSSH, + RemoteOSS, # NOTE: RemoteLOCAL is the default ] diff --git a/dvc/remote/oss.py b/dvc/remote/oss.py new file mode 100644 index 0000000000..02053feafe --- /dev/null +++ b/dvc/remote/oss.py @@ -0,0 +1,229 @@ +from __future__ import absolute_import +from __future__ import unicode_literals + +import os +import re +import string +import logging + +try: + import oss2 +except ImportError: + oss2 = None + +from dvc.utils import tmp_fname, move +from dvc.utils.compat import urlparse, makedirs +from dvc.progress import progress +from dvc.config import Config +from dvc.remote.base import RemoteBase + + +logger = logging.getLogger(__name__) + + +class Callback(object): + def __init__(self, name): + self.name = name + + def __call__(self, current, total): + progress.update_target(self.name, current, total) + + +class RemoteOSS(RemoteBase): + """ + oss2 document: + https://www.alibabacloud.com/help/doc-detail/32026.htm + + + Examples + ---------- + $ dvc remote add myremote oss://my-bucket.endpoint/path + Set key id and key secret using modify command + $ dvc remote modify myremote oss_key_id my-key-id + $ dvc remote modify myremote oss_key_secret my-key-secret + or environment variables + $ export OSS_ACCESS_KEY_ID="my-key-id" + $ export OSS_ACCESS_KEY_SECRET="my-key-secret" + """ + + scheme = "oss" + REGEX = ( + r"^oss://(?P(?P.*?)\.(?P.*?))?(?P/.*)?$" + ) + REQUIRES = {"oss2": oss2} + PARAM_CHECKSUM = "etag" + COPY_POLL_SECONDS = 5 + + def __init__(self, repo, config): + super(RemoteOSS, self).__init__(repo, config) + + self.url = config.get(Config.SECTION_REMOTE_URL) + match = re.match(self.REGEX, self.url) # backward compatibility + + self.bucket = match.group("bucket") or os.getenv("OSS_BUCKET") + if not self.bucket: + raise ValueError("oss bucket name is missing") + self.bucket = self.bucket.lower() + valid_chars = set(string.ascii_lowercase) | set(string.digits) | {"-"} + if ( + len(set(self.bucket) - valid_chars) != 0 + or self.bucket[0] == "-" + or self.bucket[-1] == "-" + or len(self.bucket) < 3 + or len(self.bucket) > 63 + ): + raise ValueError( + "oss bucket name should only contrains lowercase " + "alphabet letters, digits and - with length " + "between 3 and 63" + ) + + self.endpoint = match.group("endpoint") or os.getenv("OSS_ENDPOINT") + if not self.endpoint: + raise ValueError("oss endpoint is missing") + + path = match.group("path") + self.prefix = urlparse(self.url).path.lstrip("/") if path else "" + + self.key_id = config.get( + Config.SECTION_OSS_ACCESS_KEY_ID + ) or os.getenv("OSS_ACCESS_KEY_ID") + if not self.key_id: + raise ValueError("oss access key id is missing") + + self.key_secret = config.get( + Config.SECTION_OSS_ACCESS_KEY_SECRET + ) or os.getenv("OSS_ACCESS_KEY_SECRET") + if not self.key_secret: + raise ValueError("oss access key secret is missing") + + self._bucket = None + self.path_info = { + "scheme": self.scheme, + "bucket": self.bucket, + "endpoint": self.endpoint, + } + + @property + def oss_service(self): + if self._bucket is None: + logger.debug("URL {}".format(self.url)) + logger.debug("key id {}".format(self.key_id)) + logger.debug("key secret {}".format(self.key_secret)) + auth = oss2.Auth(self.key_id, self.key_secret) + logger.debug("bucket name {}".format(self.bucket)) + self._bucket = oss2.Bucket(auth, self.endpoint, self.bucket) + try: # verify that bucket exists + self._bucket.get_bucket_info() + except oss2.exceptions.NoSuchBucket: + self._bucket.create_bucket( + oss2.BUCKET_ACL_PUBLIC_READ, + oss2.models.BucketCreateConfig( + oss2.BUCKET_STORAGE_CLASS_STANDARD + ), + ) + return self._bucket + + def remove(self, path_info): + if path_info["scheme"] != self.scheme: + raise NotImplementedError + + logger.debug( + "Removing oss://{}.{}/{}".format( + path_info["bucket"], path_info["endpoint"], path_info["path"] + ) + ) + + self.oss_service.delete_object(path_info["path"]) + + def _list_paths(self, prefix): + for blob in oss2.ObjectIterator(self.oss_service, prefix=prefix): + yield blob.key + + def list_cache_paths(self): + return self._list_paths(self.prefix) + + def upload(self, from_infos, to_infos, names=None, no_progress_bar=False): + names = self._verify_path_args(to_infos, from_infos, names) + + for from_info, to_info, name in zip(from_infos, to_infos, names): + if to_info["scheme"] != self.scheme: + raise NotImplementedError + + if from_info["scheme"] != "local": + raise NotImplementedError + + bucket = to_info["bucket"] + path = to_info["path"] + endpoint = to_info["endpoint"] + + logger.debug( + "Uploading '{}' to 'oss://{}.{}/{}'".format( + from_info["path"], bucket, endpoint, path + ) + ) + + if not name: + name = os.path.basename(from_info["path"]) + + cb = None if no_progress_bar else Callback(name) + + try: + self.oss_service.put_object_from_file( + path, from_info["path"], progress_callback=cb + ) + except Exception: + msg = "failed to upload '{}'".format(from_info["path"]) + logger.warning(msg) + else: + progress.finish_target(name) + + def download( + self, + from_infos, + to_infos, + names=None, + no_progress_bar=False, + resume=False, + ): + names = self._verify_path_args(from_infos, to_infos, names) + + for to_info, from_info, name in zip(to_infos, from_infos, names): + if from_info["scheme"] != self.scheme: + raise NotImplementedError + + if to_info["scheme"] != "local": + raise NotImplementedError + + bucket = from_info["bucket"] + path = from_info["path"] + endpoint = from_info["endpoint"] + + logger.debug( + "Downloading 'oss://{}.{}/{}' to '{}'".format( + bucket, endpoint, path, to_info["path"] + ) + ) + + tmp_file = tmp_fname(to_info["path"]) + if not name: + name = os.path.basename(to_info["path"]) + + cb = None if no_progress_bar else Callback(name) + + makedirs(os.path.dirname(to_info["path"]), exist_ok=True) + + try: + self.oss_service.get_object_to_file( + path, tmp_file, progress_callback=cb + ) + except Exception: + msg = "failed to download 'oss://{}.{}/{}'".format( + bucket, endpoint, path + ) + logger.warning(msg) + else: + move(tmp_file, to_info["path"]) + + if not no_progress_bar: + progress.finish_target(name) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 659541bd1c..25fafdf830 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -255,6 +255,7 @@ def used_cache( cache["local"] = [] cache["s3"] = [] cache["gs"] = [] + cache["oss"] = [] cache["hdfs"] = [] cache["ssh"] = [] cache["azure"] = [] diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index 58ed42e59a..7b300c3c56 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -109,5 +109,8 @@ def gc( if self.cache.azure: _do_gc("azure", self.cache.azure.gc, clist) + if self.cache.oss: + _do_gc("oss", self.cache.oss.gc, clist) + if cloud: _do_gc("remote", self.cloud._get_cloud(remote, "gc -c").gc, clist) diff --git a/requirements.txt b/requirements.txt index d65d325cb0..eb59e5ce46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,4 @@ humanize>=0.5.1 dulwich>=0.19.11 ruamel.yaml>=0.15.91 pathlib2==2.3.3; python_version == "2.7" +oss2==2.6.1 diff --git a/setup.py b/setup.py index fb4a66eef0..d627f1d502 100644 --- a/setup.py +++ b/setup.py @@ -65,8 +65,9 @@ def run(self): gs = ["google-cloud-storage==1.13.0"] s3 = ["boto3==1.9.115"] azure = ["azure-storage-blob==1.3.0"] +oss = ["oss2==2.6.1"] ssh = ["paramiko>=2.4.1"] -all_remotes = gs + s3 + azure + ssh +all_remotes = gs + s3 + azure + ssh + oss setup( name="dvc", @@ -83,6 +84,7 @@ def run(self): "gs": gs, "s3": s3, "azure": azure, + "oss": oss, "ssh": ssh, # NOTE: https://github.com/inveniosoftware/troubleshooting/issues/1 ':python_version=="2.7"': ["futures", "pathlib2"], diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index d8a8a6921d..d3ef285f62 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -20,6 +20,7 @@ RemoteS3, RemoteGS, RemoteAzure, + RemoteOSS, RemoteLOCAL, RemoteSSH, RemoteHDFS, @@ -99,6 +100,20 @@ def _should_test_azure(): ) +def _should_test_oss(): + if os.getenv("DVC_TEST_OSS") == "true": + return True + elif os.getenv("DVC_TEST_OSS") == "false": + return False + + return ( + os.getenv("OSS_BUCKET") + and os.getenv("OSS_ENDPOINT") + and os.getenv("OSS_ACCESS_KEY_ID") + and os.getenv("OSS_ACCESS_KEY_SECRET") + ) + + def _should_test_ssh(): if os.getenv("DVC_TEST_SSH") == "true": return True @@ -208,6 +223,16 @@ def get_azure_url(): return "azure://{}/{}".format(container_name, str(uuid.uuid4())) +def get_oss_storagepath(): + return "{}.{}/{}".format( + os.getenv("OSS_BUCKET"), os.getenv("OSS_ENDPOINT"), str(uuid.uuid4()) + ) + + +def get_oss_url(): + return "oss://{}".format(get_oss_storagepath()) + + class TestDataCloud(TestDvc): def _test_cloud(self, config, cl): cloud = DataCloud(self.dvc, config=config) @@ -222,6 +247,7 @@ def test(self): ("ssh://user@localhost:/", RemoteSSH), ("http://localhost:8000/", RemoteHTTP), ("azure://ContainerName=mybucket;conn_string;", RemoteAzure), + ("oss://bucket.endpoint/path", RemoteOSS), (TestDvc.mkdtemp(), RemoteLOCAL), ] @@ -395,6 +421,17 @@ def _get_url(self): return get_azure_url() +class TestRemoteOSS(TestDataCloudBase): + def _should_test(self): + return _should_test_oss() + + def _get_url(self): + return get_oss_url() + + def _get_cloud_class(self): + return RemoteOSS + + class TestRemoteLOCAL(TestDataCloudBase): def _should_test(self): return True @@ -650,6 +687,18 @@ def _test(self): self._test_cloud(TEST_REMOTE) +class TestRemoteOSSCLI(TestDataCloudCLIBase): + def _should_test(self): + return _should_test_azure() + + def _test(self): + url = get_oss_url() + + self.main(["remote", "add", TEST_REMOTE, url]) + + self._test_cloud(TEST_REMOTE) + + class TestDataCloudErrorCLI(TestDvc): def main_fail(self, args): ret = main(args) diff --git a/tests/unit/remote/test_oss.py b/tests/unit/remote/test_oss.py new file mode 100644 index 0000000000..3adeebe958 --- /dev/null +++ b/tests/unit/remote/test_oss.py @@ -0,0 +1,26 @@ +from unittest import TestCase + +from dvc.remote.oss import RemoteOSS + + +class TestRemoteOSS(TestCase): + bucket_name = "bucket-name" + endpoint = "endpoint" + key_id = "Fq2UVErCz4I6tq" + key_secret = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsu" + + def test_init(self): + prefix = "some/prefix" + url = "oss://{}.{}/{}".format(self.bucket_name, self.endpoint, prefix) + config = { + "url": url, + "oss_key_id": self.key_id, + "oss_key_secret": self.key_secret, + } + remote = RemoteOSS(None, config) + self.assertEqual(remote.url, url) + self.assertEqual(remote.prefix, prefix) + self.assertEqual(remote.bucket, self.bucket_name) + self.assertEqual(remote.endpoint, self.endpoint) + self.assertEqual(remote.key_id, self.key_id) + self.assertEqual(remote.key_secret, self.key_secret) From 341b0c648463bed9541a67f5abb299638ce078f1 Mon Sep 17 00:00:00 2001 From: nanaya Date: Sat, 4 May 2019 10:03:25 +0800 Subject: [PATCH 02/10] Not needed, since we don't support external dependencies/outputs on OSS. See https://github.com/iterative/dvc/pull/1961. --- dvc/cache.py | 1 - dvc/config.py | 2 -- dvc/remote/oss.py | 11 +---------- dvc/repo/__init__.py | 1 - dvc/repo/gc.py | 3 --- 5 files changed, 1 insertion(+), 17 deletions(-) diff --git a/dvc/cache.py b/dvc/cache.py index 54bba75dbe..e1b9d7b5d2 100644 --- a/dvc/cache.py +++ b/dvc/cache.py @@ -45,7 +45,6 @@ def __init__(self, repo): self.local = Remote(repo, settings) self.s3 = self._get_remote(config, Config.SECTION_CACHE_S3) self.gs = self._get_remote(config, Config.SECTION_CACHE_GS) - self.oss = self._get_remote(config, Config.SECTION_CACHE_OSS) self.ssh = self._get_remote(config, Config.SECTION_CACHE_SSH) self.hdfs = self._get_remote(config, Config.SECTION_CACHE_HDFS) self.azure = self._get_remote(config, Config.SECTION_CACHE_AZURE) diff --git a/dvc/config.py b/dvc/config.py index 6aae5180da..08c8eb13c2 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -156,7 +156,6 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_CACHE_SSH = "ssh" SECTION_CACHE_HDFS = "hdfs" SECTION_CACHE_AZURE = "azure" - SECTION_CACHE_OSS = "oss" SECTION_CACHE_SLOW_LINK_WARNING = "slow_link_warning" SECTION_CACHE_SCHEMA = { Optional(SECTION_CACHE_LOCAL): str, @@ -165,7 +164,6 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_CACHE_HDFS): str, Optional(SECTION_CACHE_SSH): str, Optional(SECTION_CACHE_AZURE): str, - Optional(SECTION_CACHE_OSS): str, Optional(SECTION_CACHE_DIR): str, Optional(SECTION_CACHE_TYPE, default=None): SECTION_CACHE_TYPE_SCHEMA, Optional(SECTION_CACHE_PROTECTED, default=False): BOOL_SCHEMA, diff --git a/dvc/remote/oss.py b/dvc/remote/oss.py index 02053feafe..89eff024e9 100644 --- a/dvc/remote/oss.py +++ b/dvc/remote/oss.py @@ -16,19 +16,12 @@ from dvc.progress import progress from dvc.config import Config from dvc.remote.base import RemoteBase +from dvc.remote.azure import Callback logger = logging.getLogger(__name__) -class Callback(object): - def __init__(self, name): - self.name = name - - def __call__(self, current, total): - progress.update_target(self.name, current, total) - - class RemoteOSS(RemoteBase): """ oss2 document: @@ -187,11 +180,9 @@ def download( resume=False, ): names = self._verify_path_args(from_infos, to_infos, names) - for to_info, from_info, name in zip(to_infos, from_infos, names): if from_info["scheme"] != self.scheme: raise NotImplementedError - if to_info["scheme"] != "local": raise NotImplementedError diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 25fafdf830..659541bd1c 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -255,7 +255,6 @@ def used_cache( cache["local"] = [] cache["s3"] = [] cache["gs"] = [] - cache["oss"] = [] cache["hdfs"] = [] cache["ssh"] = [] cache["azure"] = [] diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index 7b300c3c56..58ed42e59a 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -109,8 +109,5 @@ def gc( if self.cache.azure: _do_gc("azure", self.cache.azure.gc, clist) - if self.cache.oss: - _do_gc("oss", self.cache.oss.gc, clist) - if cloud: _do_gc("remote", self.cloud._get_cloud(remote, "gc -c").gc, clist) From afbe0e48868234ce90ee36b6178b8d97290ea9cc Mon Sep 17 00:00:00 2001 From: nanaya Date: Sat, 4 May 2019 16:47:47 +0800 Subject: [PATCH 03/10] Add a way to test oss storage using docker. Start a container running an oss emulator. $ git clone https://github.com/nanaya-tachibana/oss-emulator.git $ docker image build -t oss:1.0 oss-emulator $ docker run --detach -p 8880:8880 --name oss-emulator oss:1.0 Setup environment variables. $ export OSS_BUCKET='my-bucket' $ export OSS_ENDPOINT='localhost:8880' $ export OSS_ACCESS_KEY_ID='AccessKeyID' $ export OSS_ACCESS_KEY_SECRET='AccessKeySecret' --- scripts/ci/before_install.sh | 1 + scripts/ci/install_oss.sh | 11 +++++++++++ tests/func/test_data_cloud.py | 14 +++++++------- 3 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 scripts/ci/install_oss.sh diff --git a/scripts/ci/before_install.sh b/scripts/ci/before_install.sh index 8c08d573eb..1fe567c902 100644 --- a/scripts/ci/before_install.sh +++ b/scripts/ci/before_install.sh @@ -49,6 +49,7 @@ echo > env.sh if [ -n "$TRAVIS_OS_NAME" ] && [ "$TRAVIS_OS_NAME" != "osx" ] \ && [ "$TRAVIS_PULL_REQUEST" == "false" ]; then bash "$scriptdir/install_azurite.sh" + bash "$scriptdir/install_oss.sh" bash "$scriptdir/install_hadoop.sh" fi diff --git a/scripts/ci/install_oss.sh b/scripts/ci/install_oss.sh new file mode 100644 index 0000000000..1056d8672f --- /dev/null +++ b/scripts/ci/install_oss.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -euo pipefail + +git clone https://github.com/nanaya-tachibana/oss-emulator.git +sudo docker image build -t oss:1.0 oss-emulator +sudo docker run --detach --restart always -p 8880:8880 --name oss-emulator oss:1.0 +echo "export OSS_BUCKET='my-bucket'" >> env.sh +echo "export OSS_ENDPOINT='localhost:8880'" >> env.sh +echo "export OSS_ACCESS_KEY_ID='AccessKeyID'" >> env.sh +echo "export OSS_ACCESS_KEY_SECRET='AccessKeySecret'" >> env.sh diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index d3ef285f62..3ca7ff083e 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -687,16 +687,16 @@ def _test(self): self._test_cloud(TEST_REMOTE) -class TestRemoteOSSCLI(TestDataCloudCLIBase): - def _should_test(self): - return _should_test_azure() +# class TestRemoteOSSCLI(TestDataCloudCLIBase): +# def _should_test(self): +# return _should_test_oss() - def _test(self): - url = get_oss_url() +# def _test(self): +# url = get_oss_url() - self.main(["remote", "add", TEST_REMOTE, url]) +# self.main(["remote", "add", TEST_REMOTE, url]) - self._test_cloud(TEST_REMOTE) +# self._test_cloud(TEST_REMOTE) class TestDataCloudErrorCLI(TestDvc): From 740d05920b7bf893230e3d1e03bf4eb50a7fcb13 Mon Sep 17 00:00:00 2001 From: nanaya Date: Sat, 4 May 2019 18:04:54 +0800 Subject: [PATCH 04/10] Use default key id and key secret when they are not given, which gives read access to public read bucket and public bucket. --- dvc/remote/oss.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dvc/remote/oss.py b/dvc/remote/oss.py index 89eff024e9..f513c56f15 100644 --- a/dvc/remote/oss.py +++ b/dvc/remote/oss.py @@ -78,17 +78,17 @@ def __init__(self, repo, config): path = match.group("path") self.prefix = urlparse(self.url).path.lstrip("/") if path else "" - self.key_id = config.get( - Config.SECTION_OSS_ACCESS_KEY_ID - ) or os.getenv("OSS_ACCESS_KEY_ID") - if not self.key_id: - raise ValueError("oss access key id is missing") - - self.key_secret = config.get( - Config.SECTION_OSS_ACCESS_KEY_SECRET - ) or os.getenv("OSS_ACCESS_KEY_SECRET") - if not self.key_secret: - raise ValueError("oss access key secret is missing") + self.key_id = ( + config.get(Config.SECTION_OSS_ACCESS_KEY_ID) + or os.getenv("OSS_ACCESS_KEY_ID") + or "defaultId" + ) + + self.key_secret = ( + config.get(Config.SECTION_OSS_ACCESS_KEY_SECRET) + or os.getenv("OSS_ACCESS_KEY_SECRET") + or "defaultSecret" + ) self._bucket = None self.path_info = { From a954a7874f2ffb1e3ad0e01d18a5315509b7ec25 Mon Sep 17 00:00:00 2001 From: nanaya Date: Sat, 4 May 2019 20:47:00 +0800 Subject: [PATCH 05/10] test: add oss tests to appveyor. --- .appveyor.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 21b0cefb65..cbed0d40b8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -20,12 +20,17 @@ environment: secure: XN4jRtmGE5Bqg8pPZkwNs7kn3UEI73Rkldqc0MGsQISZBm5TNJZOPofDMc1QnUsf AZURE_STORAGE_CONTAINER_NAME: appveyor-tests AZURE_STORAGE_CONNECTION_STRING: DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1; + OSS_BUCKET: my-bucket + OSS_ENDPOINT: localhost:50004 + OSS_ACCESS_KEY_ID: AccessKeyID + OSS_ACCESS_KEY_SECRET: AccessKeySecret matrix: - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.x" # currently 2.7.9 PYTHON_ARCH: "32" + RUBY: C:\Ruby25\bin # Disabled as unnecessary # - PYTHON: "C:\\Python27-x64" @@ -33,7 +38,12 @@ environment: # PYTHON_ARCH: "64" install: - - ps: Install-Product node + - SET PATH=%RUBY%;%PATH% + - git clone -q https://github.com/nanaya-tachibana/oss-emulator.git + - gem install thor builder + - ps: cd oss-emulator + - ps: $OSSProcess = Start-Process -FilePath ruby -ArgumentList "bin\emulator -r store -p 50004" -PassThru -NoNewWindow + - ps: cd .. - npm install -g azurite - ps: $AzuriteProcess = Start-Process azurite-blob -PassThru - cinst wget @@ -66,6 +76,7 @@ after_test: on_finish: - ps: Stop-Process $AzuriteProcess + - ps: Stop-Process $OSSProcess artifacts: - path: dvc*.exe From 55efd59dac82e28f98090ab677922af41c035a48 Mon Sep 17 00:00:00 2001 From: nanaya Date: Tue, 7 May 2019 14:16:37 +0800 Subject: [PATCH 06/10] remove unneeded tests. --- tests/func/test_data_cloud.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 3ca7ff083e..b39e57346b 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -45,6 +45,7 @@ TEST_AWS_REPO_BUCKET = "dvc-test" TEST_GCP_REPO_BUCKET = "dvc-test" +TEST_OSS_REPO_BUCKET = "dvc-test" def _should_test_aws(): @@ -107,8 +108,7 @@ def _should_test_oss(): return False return ( - os.getenv("OSS_BUCKET") - and os.getenv("OSS_ENDPOINT") + os.getenv("OSS_ENDPOINT") and os.getenv("OSS_ACCESS_KEY_ID") and os.getenv("OSS_ACCESS_KEY_SECRET") ) @@ -224,9 +224,7 @@ def get_azure_url(): def get_oss_storagepath(): - return "{}.{}/{}".format( - os.getenv("OSS_BUCKET"), os.getenv("OSS_ENDPOINT"), str(uuid.uuid4()) - ) + return "{}/{}".format(TEST_OSS_REPO_BUCKET, (uuid.uuid4())) def get_oss_url(): @@ -247,7 +245,7 @@ def test(self): ("ssh://user@localhost:/", RemoteSSH), ("http://localhost:8000/", RemoteHTTP), ("azure://ContainerName=mybucket;conn_string;", RemoteAzure), - ("oss://bucket.endpoint/path", RemoteOSS), + ("oss://mybucket/", RemoteOSS), (TestDvc.mkdtemp(), RemoteLOCAL), ] @@ -687,18 +685,6 @@ def _test(self): self._test_cloud(TEST_REMOTE) -# class TestRemoteOSSCLI(TestDataCloudCLIBase): -# def _should_test(self): -# return _should_test_oss() - -# def _test(self): -# url = get_oss_url() - -# self.main(["remote", "add", TEST_REMOTE, url]) - -# self._test_cloud(TEST_REMOTE) - - class TestDataCloudErrorCLI(TestDvc): def main_fail(self, args): ret = main(args) From 0c3a2ab9ff6e8bbf7748fba352d6ba56638685cf Mon Sep 17 00:00:00 2001 From: nanaya Date: Tue, 7 May 2019 14:18:30 +0800 Subject: [PATCH 07/10] remote: use s3 style url for oss storage and make endpoint a configurable value. Usage: $ dvc remote add myremote oss://my-bucket/path Set key id, key secret and endpoint using modify command $ dvc remote modify myremote oss_key_id my-key-id $ dvc remote modify myremote oss_key_secret my-key-secret $ dvc remote modify myremote oss_endpoint endpoint or environment variables $ export OSS_ACCESS_KEY_ID="my-key-id" $ export OSS_ACCESS_KEY_SECRET="my-key-secret" $ export OSS_ENDPOINT="endpoint" --- .appveyor.yml | 1 - dvc/config.py | 2 ++ dvc/remote/oss.py | 67 ++++++++++------------------------- scripts/ci/install_oss.sh | 1 - tests/unit/remote/test_oss.py | 3 +- 5 files changed, 23 insertions(+), 51 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index cbed0d40b8..32f73ab603 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -20,7 +20,6 @@ environment: secure: XN4jRtmGE5Bqg8pPZkwNs7kn3UEI73Rkldqc0MGsQISZBm5TNJZOPofDMc1QnUsf AZURE_STORAGE_CONTAINER_NAME: appveyor-tests AZURE_STORAGE_CONNECTION_STRING: DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1; - OSS_BUCKET: my-bucket OSS_ENDPOINT: localhost:50004 OSS_ACCESS_KEY_ID: AccessKeyID OSS_ACCESS_KEY_SECRET: AccessKeySecret diff --git a/dvc/config.py b/dvc/config.py index 08c8eb13c2..a2956d3d0b 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -230,6 +230,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes # Alibabacloud oss options SECTION_OSS_ACCESS_KEY_ID = "oss_key_id" SECTION_OSS_ACCESS_KEY_SECRET = "oss_key_secret" + SECTION_OSS_ENDPOINT = "oss_endpoint" SECTION_REMOTE_REGEX = r'^\s*remote\s*"(?P.*)"\s*$' SECTION_REMOTE_FMT = 'remote "{}"' @@ -260,6 +261,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_AZURE_CONNECTION_STRING): str, Optional(SECTION_OSS_ACCESS_KEY_ID): str, Optional(SECTION_OSS_ACCESS_KEY_SECRET): str, + Optional(SECTION_OSS_ENDPOINT): str, Optional(PRIVATE_CWD): str, } diff --git a/dvc/remote/oss.py b/dvc/remote/oss.py index f513c56f15..7615ee5352 100644 --- a/dvc/remote/oss.py +++ b/dvc/remote/oss.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals import os -import re -import string import logging try: @@ -30,19 +28,19 @@ class RemoteOSS(RemoteBase): Examples ---------- - $ dvc remote add myremote oss://my-bucket.endpoint/path - Set key id and key secret using modify command + $ dvc remote add myremote oss://my-bucket/path + Set key id, key secret and endpoint using modify command $ dvc remote modify myremote oss_key_id my-key-id $ dvc remote modify myremote oss_key_secret my-key-secret + $ dvc remote modify myremote oss_endpoint endpoint or environment variables $ export OSS_ACCESS_KEY_ID="my-key-id" $ export OSS_ACCESS_KEY_SECRET="my-key-secret" + $ export OSS_ENDPOINT="endpoint" """ scheme = "oss" - REGEX = ( - r"^oss://(?P(?P.*?)\.(?P.*?))?(?P/.*)?$" - ) + REGEX = r"^oss://(?P.*)?$" REQUIRES = {"oss2": oss2} PARAM_CHECKSUM = "etag" COPY_POLL_SECONDS = 5 @@ -51,32 +49,13 @@ def __init__(self, repo, config): super(RemoteOSS, self).__init__(repo, config) self.url = config.get(Config.SECTION_REMOTE_URL) - match = re.match(self.REGEX, self.url) # backward compatibility - - self.bucket = match.group("bucket") or os.getenv("OSS_BUCKET") - if not self.bucket: - raise ValueError("oss bucket name is missing") - self.bucket = self.bucket.lower() - valid_chars = set(string.ascii_lowercase) | set(string.digits) | {"-"} - if ( - len(set(self.bucket) - valid_chars) != 0 - or self.bucket[0] == "-" - or self.bucket[-1] == "-" - or len(self.bucket) < 3 - or len(self.bucket) > 63 - ): - raise ValueError( - "oss bucket name should only contrains lowercase " - "alphabet letters, digits and - with length " - "between 3 and 63" - ) - - self.endpoint = match.group("endpoint") or os.getenv("OSS_ENDPOINT") - if not self.endpoint: - raise ValueError("oss endpoint is missing") + parsed = urlparse(self.url) + self.bucket = parsed.netloc + self.prefix = parsed.path.lstrip("/") - path = match.group("path") - self.prefix = urlparse(self.url).path.lstrip("/") if path else "" + self.endpoint = config.get(Config.SECTION_OSS_ENDPOINT) or os.getenv( + "OSS_ENDPOINT" + ) self.key_id = ( config.get(Config.SECTION_OSS_ACCESS_KEY_ID) @@ -91,11 +70,7 @@ def __init__(self, repo, config): ) self._bucket = None - self.path_info = { - "scheme": self.scheme, - "bucket": self.bucket, - "endpoint": self.endpoint, - } + self.path_info = {"scheme": self.scheme, "bucket": self.bucket} @property def oss_service(self): @@ -122,8 +97,8 @@ def remove(self, path_info): raise NotImplementedError logger.debug( - "Removing oss://{}.{}/{}".format( - path_info["bucket"], path_info["endpoint"], path_info["path"] + "Removing oss://{}/{}".format( + path_info["bucket"], path_info["path"] ) ) @@ -148,11 +123,10 @@ def upload(self, from_infos, to_infos, names=None, no_progress_bar=False): bucket = to_info["bucket"] path = to_info["path"] - endpoint = to_info["endpoint"] logger.debug( - "Uploading '{}' to 'oss://{}.{}/{}'".format( - from_info["path"], bucket, endpoint, path + "Uploading '{}' to 'oss://{}/{}'".format( + from_info["path"], bucket, path ) ) @@ -188,11 +162,10 @@ def download( bucket = from_info["bucket"] path = from_info["path"] - endpoint = from_info["endpoint"] logger.debug( - "Downloading 'oss://{}.{}/{}' to '{}'".format( - bucket, endpoint, path, to_info["path"] + "Downloading 'oss://{}/{}' to '{}'".format( + bucket, path, to_info["path"] ) ) @@ -209,9 +182,7 @@ def download( path, tmp_file, progress_callback=cb ) except Exception: - msg = "failed to download 'oss://{}.{}/{}'".format( - bucket, endpoint, path - ) + msg = "failed to download 'oss://{}/{}'".format(bucket, path) logger.warning(msg) else: move(tmp_file, to_info["path"]) diff --git a/scripts/ci/install_oss.sh b/scripts/ci/install_oss.sh index 1056d8672f..a845025a80 100644 --- a/scripts/ci/install_oss.sh +++ b/scripts/ci/install_oss.sh @@ -5,7 +5,6 @@ set -euo pipefail git clone https://github.com/nanaya-tachibana/oss-emulator.git sudo docker image build -t oss:1.0 oss-emulator sudo docker run --detach --restart always -p 8880:8880 --name oss-emulator oss:1.0 -echo "export OSS_BUCKET='my-bucket'" >> env.sh echo "export OSS_ENDPOINT='localhost:8880'" >> env.sh echo "export OSS_ACCESS_KEY_ID='AccessKeyID'" >> env.sh echo "export OSS_ACCESS_KEY_SECRET='AccessKeySecret'" >> env.sh diff --git a/tests/unit/remote/test_oss.py b/tests/unit/remote/test_oss.py index 3adeebe958..e29277c96d 100644 --- a/tests/unit/remote/test_oss.py +++ b/tests/unit/remote/test_oss.py @@ -11,11 +11,12 @@ class TestRemoteOSS(TestCase): def test_init(self): prefix = "some/prefix" - url = "oss://{}.{}/{}".format(self.bucket_name, self.endpoint, prefix) + url = "oss://{}/{}".format(self.bucket_name, prefix) config = { "url": url, "oss_key_id": self.key_id, "oss_key_secret": self.key_secret, + "oss_endpoint": self.endpoint, } remote = RemoteOSS(None, config) self.assertEqual(remote.url, url) From 12bd31323afc50b1e869959c2e0ff107c000a228 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 9 May 2019 15:02:22 +0300 Subject: [PATCH 08/10] remote: fallback to [] if there are no cinfos Signed-off-by: Ruslan Kuprieiev --- dvc/remote/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 373c151a3f..8bd3cac835 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -487,7 +487,10 @@ def gc(self, cinfos): used = {info[RemoteLOCAL.PARAM_CHECKSUM] for info in cinfos["local"]} if self.scheme != "": - used |= {info[self.PARAM_CHECKSUM] for info in cinfos[self.scheme]} + used |= { + info[self.PARAM_CHECKSUM] + for info in cinfos.get(self.scheme, []) + } removed = False for checksum in self.all(): From ce74e2ecb5f735eb07ade8b3c69e904336ea47a3 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 9 May 2019 15:04:49 +0300 Subject: [PATCH 09/10] test: remote: add oss CLI test Signed-off-by: Ruslan Kuprieiev --- tests/func/test_data_cloud.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index b39e57346b..66a07ad449 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -685,6 +685,18 @@ def _test(self): self._test_cloud(TEST_REMOTE) +class TestRemoteOSSCLI(TestDataCloudCLIBase): + def _should_test(self): + return _should_test_oss() + + def _test(self): + url = get_oss_url() + + self.main(["remote", "add", TEST_REMOTE, url]) + + self._test_cloud(TEST_REMOTE) + + class TestDataCloudErrorCLI(TestDvc): def main_fail(self, args): ret = main(args) From 47400bba6f9de3345f1086fcd85948a071c32957 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 9 May 2019 15:14:56 +0300 Subject: [PATCH 10/10] travis: use iterative's fork of oss-emulator Just to keep things in-house. Signed-off-by: Ruslan Kuprieiev --- scripts/ci/install_oss.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/install_oss.sh b/scripts/ci/install_oss.sh index a845025a80..11f4da7cd0 100644 --- a/scripts/ci/install_oss.sh +++ b/scripts/ci/install_oss.sh @@ -2,7 +2,7 @@ set -euo pipefail -git clone https://github.com/nanaya-tachibana/oss-emulator.git +git clone https://github.com/iterative/oss-emulator.git sudo docker image build -t oss:1.0 oss-emulator sudo docker run --detach --restart always -p 8880:8880 --name oss-emulator oss:1.0 echo "export OSS_ENDPOINT='localhost:8880'" >> env.sh