diff --git a/.appveyor.yml b/.appveyor.yml index 21b0cefb65..32f73ab603 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -20,12 +20,16 @@ environment: secure: XN4jRtmGE5Bqg8pPZkwNs7kn3UEI73Rkldqc0MGsQISZBm5TNJZOPofDMc1QnUsf AZURE_STORAGE_CONTAINER_NAME: appveyor-tests AZURE_STORAGE_CONNECTION_STRING: DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1; + OSS_ENDPOINT: localhost:50004 + OSS_ACCESS_KEY_ID: AccessKeyID + OSS_ACCESS_KEY_SECRET: AccessKeySecret matrix: - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.x" # currently 2.7.9 PYTHON_ARCH: "32" + RUBY: C:\Ruby25\bin # Disabled as unnecessary # - PYTHON: "C:\\Python27-x64" @@ -33,7 +37,12 @@ environment: # PYTHON_ARCH: "64" install: - - ps: Install-Product node + - SET PATH=%RUBY%;%PATH% + - git clone -q https://github.com/nanaya-tachibana/oss-emulator.git + - gem install thor builder + - ps: cd oss-emulator + - ps: $OSSProcess = Start-Process -FilePath ruby -ArgumentList "bin\emulator -r store -p 50004" -PassThru -NoNewWindow + - ps: cd .. - npm install -g azurite - ps: $AzuriteProcess = Start-Process azurite-blob -PassThru - cinst wget @@ -66,6 +75,7 @@ after_test: on_finish: - ps: Stop-Process $AzuriteProcess + - ps: Stop-Process $OSSProcess artifacts: - path: dvc*.exe diff --git a/README.rst b/README.rst index 76cbf370b5..fabda7040f 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ .. image:: https://codecov.io/gh/iterative/dvc/branch/master/graph/badge.svg :target: https://codecov.io/gh/iterative/dvc :alt: Codecov - + .. image:: https://img.shields.io/badge/patreon-donate-green.svg :target: https://www.patreon.com/DVCorg/overview :alt: Donate @@ -38,7 +38,7 @@ machine learning projects. Key features: any databases. Does not depend on any proprietary online services; #. it manages and versions **datasets** and **machine learning models**. 
Data is saved in - S3, Google cloud, Azure, SSH server, HDFS or even local HDD RAID; + S3, Google cloud, Azure, Alibaba cloud, SSH server, HDFS or even local HDD RAID; #. it makes projects **reproducible** and **shareable**, it helps answering question how the model was build; @@ -63,7 +63,7 @@ made right and tailored specifically for ML and Data Science scenarios. #. ``Git/Git-lfs`` part - DVC helps you storing and sharing data artifacts, models. It connects them with your Git repository. -#. ``Makefiles`` part - DVC describes how one data or model artifact was build from another data. +#. ``Makefiles`` part - DVC describes how one data or model artifact was built from another data. DVC usually runs along with Git. Git is used as usual to store and version code and DVC meta-files. DVC helps to store data and model files seamlessly out of Git while preserving almost the same user experience as if they @@ -118,7 +118,7 @@ pip (PyPI) pip install dvc Depending on the remote storage type you plan to use to keep and share your data, you might need to specify -one of the optional dependencies: ``s3``, ``gs``, ``azure``, ``ssh``. Or ``all_remotes`` to include them all. +one of the optional dependencies: ``s3``, ``gs``, ``azure``, ``oss``, ``ssh``. Or ``all_remotes`` to include them all. The command should look like this: ``pip install dvc[s3]`` - it installs the ``boto3`` library along with DVC to support the AWS S3 storage.
diff --git a/dvc/config.py b/dvc/config.py index 89fb00497f..a2956d3d0b 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -227,6 +227,10 @@ class Config(object): # pylint: disable=too-many-instance-attributes SECTION_LOCAL_SCHEMA = {SECTION_LOCAL_STORAGEPATH: str} SECTION_AZURE_CONNECTION_STRING = "connection_string" + # Alibabacloud oss options + SECTION_OSS_ACCESS_KEY_ID = "oss_key_id" + SECTION_OSS_ACCESS_KEY_SECRET = "oss_key_secret" + SECTION_OSS_ENDPOINT = "oss_endpoint" SECTION_REMOTE_REGEX = r'^\s*remote\s*"(?P.*)"\s*$' SECTION_REMOTE_FMT = 'remote "{}"' @@ -255,6 +259,9 @@ class Config(object): # pylint: disable=too-many-instance-attributes Optional(SECTION_REMOTE_PASSWORD): str, Optional(SECTION_REMOTE_ASK_PASSWORD): BOOL_SCHEMA, Optional(SECTION_AZURE_CONNECTION_STRING): str, + Optional(SECTION_OSS_ACCESS_KEY_ID): str, + Optional(SECTION_OSS_ACCESS_KEY_SECRET): str, + Optional(SECTION_OSS_ENDPOINT): str, Optional(PRIVATE_CWD): str, } diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index f9704336b9..73649f06be 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -9,6 +9,7 @@ from dvc.remote.s3 import RemoteS3 from dvc.remote.gs import RemoteGS from dvc.remote.azure import RemoteAzure +from dvc.remote.oss import RemoteOSS from dvc.remote.ssh import RemoteSSH from dvc.remote.hdfs import RemoteHDFS from dvc.remote.local import RemoteLOCAL @@ -34,6 +35,7 @@ class DataCloud(object): "aws": RemoteS3, "gcp": RemoteGS, "azure": RemoteAzure, + "oss": RemoteOSS, "ssh": RemoteSSH, "hdfs": RemoteHDFS, "local": RemoteLOCAL, diff --git a/dvc/remote/__init__.py b/dvc/remote/__init__.py index 94b274f4a4..5c35c038d8 100644 --- a/dvc/remote/__init__.py +++ b/dvc/remote/__init__.py @@ -7,6 +7,7 @@ from dvc.remote.s3 import RemoteS3 from dvc.remote.ssh import RemoteSSH from dvc.remote.http import RemoteHTTP +from dvc.remote.oss import RemoteOSS REMOTES = [ @@ -16,6 +17,7 @@ RemoteHTTP, RemoteS3, RemoteSSH, + RemoteOSS, # NOTE: RemoteLOCAL is the default ] diff --git 
a/dvc/remote/base.py b/dvc/remote/base.py index 373c151a3f..8bd3cac835 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -487,7 +487,10 @@ def gc(self, cinfos): used = {info[RemoteLOCAL.PARAM_CHECKSUM] for info in cinfos["local"]} if self.scheme != "": - used |= {info[self.PARAM_CHECKSUM] for info in cinfos[self.scheme]} + used |= { + info[self.PARAM_CHECKSUM] + for info in cinfos.get(self.scheme, []) + } removed = False for checksum in self.all(): diff --git a/dvc/remote/oss.py b/dvc/remote/oss.py new file mode 100644 index 0000000000..7615ee5352 --- /dev/null +++ b/dvc/remote/oss.py @@ -0,0 +1,191 @@ +from __future__ import absolute_import +from __future__ import unicode_literals + +import os +import logging + +try: + import oss2 +except ImportError: + oss2 = None + +from dvc.utils import tmp_fname, move +from dvc.utils.compat import urlparse, makedirs +from dvc.progress import progress +from dvc.config import Config +from dvc.remote.base import RemoteBase +from dvc.remote.azure import Callback + + +logger = logging.getLogger(__name__) + + +class RemoteOSS(RemoteBase): + """ + oss2 document: + https://www.alibabacloud.com/help/doc-detail/32026.htm + + + Examples + ---------- + $ dvc remote add myremote oss://my-bucket/path + Set key id, key secret and endpoint using modify command + $ dvc remote modify myremote oss_key_id my-key-id + $ dvc remote modify myremote oss_key_secret my-key-secret + $ dvc remote modify myremote oss_endpoint endpoint + or environment variables + $ export OSS_ACCESS_KEY_ID="my-key-id" + $ export OSS_ACCESS_KEY_SECRET="my-key-secret" + $ export OSS_ENDPOINT="endpoint" + """ + + scheme = "oss" + REGEX = r"^oss://(?P.*)?$" + REQUIRES = {"oss2": oss2} + PARAM_CHECKSUM = "etag" + COPY_POLL_SECONDS = 5 + + def __init__(self, repo, config): + super(RemoteOSS, self).__init__(repo, config) + + self.url = config.get(Config.SECTION_REMOTE_URL) + parsed = urlparse(self.url) + self.bucket = parsed.netloc + self.prefix = 
parsed.path.lstrip("/") + + self.endpoint = config.get(Config.SECTION_OSS_ENDPOINT) or os.getenv( + "OSS_ENDPOINT" + ) + + self.key_id = ( + config.get(Config.SECTION_OSS_ACCESS_KEY_ID) + or os.getenv("OSS_ACCESS_KEY_ID") + or "defaultId" + ) + + self.key_secret = ( + config.get(Config.SECTION_OSS_ACCESS_KEY_SECRET) + or os.getenv("OSS_ACCESS_KEY_SECRET") + or "defaultSecret" + ) + + self._bucket = None + self.path_info = {"scheme": self.scheme, "bucket": self.bucket} + + @property + def oss_service(self): + if self._bucket is None: + logger.debug("URL {}".format(self.url)) + logger.debug("key id {}".format(self.key_id)) + logger.debug("key secret {}".format(self.key_secret)) + auth = oss2.Auth(self.key_id, self.key_secret) + logger.debug("bucket name {}".format(self.bucket)) + self._bucket = oss2.Bucket(auth, self.endpoint, self.bucket) + try: # verify that bucket exists + self._bucket.get_bucket_info() + except oss2.exceptions.NoSuchBucket: + self._bucket.create_bucket( + oss2.BUCKET_ACL_PUBLIC_READ, + oss2.models.BucketCreateConfig( + oss2.BUCKET_STORAGE_CLASS_STANDARD + ), + ) + return self._bucket + + def remove(self, path_info): + if path_info["scheme"] != self.scheme: + raise NotImplementedError + + logger.debug( + "Removing oss://{}/{}".format( + path_info["bucket"], path_info["path"] + ) + ) + + self.oss_service.delete_object(path_info["path"]) + + def _list_paths(self, prefix): + for blob in oss2.ObjectIterator(self.oss_service, prefix=prefix): + yield blob.key + + def list_cache_paths(self): + return self._list_paths(self.prefix) + + def upload(self, from_infos, to_infos, names=None, no_progress_bar=False): + names = self._verify_path_args(to_infos, from_infos, names) + + for from_info, to_info, name in zip(from_infos, to_infos, names): + if to_info["scheme"] != self.scheme: + raise NotImplementedError + + if from_info["scheme"] != "local": + raise NotImplementedError + + bucket = to_info["bucket"] + path = to_info["path"] + + logger.debug( + "Uploading 
'{}' to 'oss://{}/{}'".format( + from_info["path"], bucket, path + ) + ) + + if not name: + name = os.path.basename(from_info["path"]) + + cb = None if no_progress_bar else Callback(name) + + try: + self.oss_service.put_object_from_file( + path, from_info["path"], progress_callback=cb + ) + except Exception: + msg = "failed to upload '{}'".format(from_info["path"]) + logger.warning(msg) + else: + progress.finish_target(name) + + def download( + self, + from_infos, + to_infos, + names=None, + no_progress_bar=False, + resume=False, + ): + names = self._verify_path_args(from_infos, to_infos, names) + for to_info, from_info, name in zip(to_infos, from_infos, names): + if from_info["scheme"] != self.scheme: + raise NotImplementedError + if to_info["scheme"] != "local": + raise NotImplementedError + + bucket = from_info["bucket"] + path = from_info["path"] + + logger.debug( + "Downloading 'oss://{}/{}' to '{}'".format( + bucket, path, to_info["path"] + ) + ) + + tmp_file = tmp_fname(to_info["path"]) + if not name: + name = os.path.basename(to_info["path"]) + + cb = None if no_progress_bar else Callback(name) + + makedirs(os.path.dirname(to_info["path"]), exist_ok=True) + + try: + self.oss_service.get_object_to_file( + path, tmp_file, progress_callback=cb + ) + except Exception: + msg = "failed to download 'oss://{}/{}'".format(bucket, path) + logger.warning(msg) + else: + move(tmp_file, to_info["path"]) + + if not no_progress_bar: + progress.finish_target(name) diff --git a/requirements.txt b/requirements.txt index d65d325cb0..eb59e5ce46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,4 @@ humanize>=0.5.1 dulwich>=0.19.11 ruamel.yaml>=0.15.91 pathlib2==2.3.3; python_version == "2.7" +oss2==2.6.1 diff --git a/scripts/ci/before_install.sh b/scripts/ci/before_install.sh index 8c08d573eb..1fe567c902 100644 --- a/scripts/ci/before_install.sh +++ b/scripts/ci/before_install.sh @@ -49,6 +49,7 @@ echo > env.sh if [ -n "$TRAVIS_OS_NAME" ] && [ "$TRAVIS_OS_NAME" 
!= "osx" ] \ && [ "$TRAVIS_PULL_REQUEST" == "false" ]; then bash "$scriptdir/install_azurite.sh" + bash "$scriptdir/install_oss.sh" bash "$scriptdir/install_hadoop.sh" fi diff --git a/scripts/ci/install_oss.sh b/scripts/ci/install_oss.sh new file mode 100644 index 0000000000..11f4da7cd0 --- /dev/null +++ b/scripts/ci/install_oss.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -euo pipefail + +git clone https://github.com/iterative/oss-emulator.git +sudo docker image build -t oss:1.0 oss-emulator +sudo docker run --detach --restart always -p 8880:8880 --name oss-emulator oss:1.0 +echo "export OSS_ENDPOINT='localhost:8880'" >> env.sh +echo "export OSS_ACCESS_KEY_ID='AccessKeyID'" >> env.sh +echo "export OSS_ACCESS_KEY_SECRET='AccessKeySecret'" >> env.sh diff --git a/setup.py b/setup.py index fb4a66eef0..d627f1d502 100644 --- a/setup.py +++ b/setup.py @@ -65,8 +65,9 @@ def run(self): gs = ["google-cloud-storage==1.13.0"] s3 = ["boto3==1.9.115"] azure = ["azure-storage-blob==1.3.0"] +oss = ["oss2==2.6.1"] ssh = ["paramiko>=2.4.1"] -all_remotes = gs + s3 + azure + ssh +all_remotes = gs + s3 + azure + ssh + oss setup( name="dvc", @@ -83,6 +84,7 @@ def run(self): "gs": gs, "s3": s3, "azure": azure, + "oss": oss, "ssh": ssh, # NOTE: https://github.com/inveniosoftware/troubleshooting/issues/1 ':python_version=="2.7"': ["futures", "pathlib2"], diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index d8a8a6921d..66a07ad449 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -20,6 +20,7 @@ RemoteS3, RemoteGS, RemoteAzure, + RemoteOSS, RemoteLOCAL, RemoteSSH, RemoteHDFS, @@ -44,6 +45,7 @@ TEST_AWS_REPO_BUCKET = "dvc-test" TEST_GCP_REPO_BUCKET = "dvc-test" +TEST_OSS_REPO_BUCKET = "dvc-test" def _should_test_aws(): @@ -99,6 +101,19 @@ def _should_test_azure(): ) +def _should_test_oss(): + if os.getenv("DVC_TEST_OSS") == "true": + return True + elif os.getenv("DVC_TEST_OSS") == "false": + return False + + return ( + 
os.getenv("OSS_ENDPOINT") + and os.getenv("OSS_ACCESS_KEY_ID") + and os.getenv("OSS_ACCESS_KEY_SECRET") + ) + + def _should_test_ssh(): if os.getenv("DVC_TEST_SSH") == "true": return True @@ -208,6 +223,14 @@ def get_azure_url(): return "azure://{}/{}".format(container_name, str(uuid.uuid4())) +def get_oss_storagepath(): + return "{}/{}".format(TEST_OSS_REPO_BUCKET, (uuid.uuid4())) + + +def get_oss_url(): + return "oss://{}".format(get_oss_storagepath()) + + class TestDataCloud(TestDvc): def _test_cloud(self, config, cl): cloud = DataCloud(self.dvc, config=config) @@ -222,6 +245,7 @@ def test(self): ("ssh://user@localhost:/", RemoteSSH), ("http://localhost:8000/", RemoteHTTP), ("azure://ContainerName=mybucket;conn_string;", RemoteAzure), + ("oss://mybucket/", RemoteOSS), (TestDvc.mkdtemp(), RemoteLOCAL), ] @@ -395,6 +419,17 @@ def _get_url(self): return get_azure_url() +class TestRemoteOSS(TestDataCloudBase): + def _should_test(self): + return _should_test_oss() + + def _get_url(self): + return get_oss_url() + + def _get_cloud_class(self): + return RemoteOSS + + class TestRemoteLOCAL(TestDataCloudBase): def _should_test(self): return True @@ -650,6 +685,18 @@ def _test(self): self._test_cloud(TEST_REMOTE) +class TestRemoteOSSCLI(TestDataCloudCLIBase): + def _should_test(self): + return _should_test_oss() + + def _test(self): + url = get_oss_url() + + self.main(["remote", "add", TEST_REMOTE, url]) + + self._test_cloud(TEST_REMOTE) + + class TestDataCloudErrorCLI(TestDvc): def main_fail(self, args): ret = main(args) diff --git a/tests/unit/remote/test_oss.py b/tests/unit/remote/test_oss.py new file mode 100644 index 0000000000..e29277c96d --- /dev/null +++ b/tests/unit/remote/test_oss.py @@ -0,0 +1,27 @@ +from unittest import TestCase + +from dvc.remote.oss import RemoteOSS + + +class TestRemoteOSS(TestCase): + bucket_name = "bucket-name" + endpoint = "endpoint" + key_id = "Fq2UVErCz4I6tq" + key_secret = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsu" + + def 
test_init(self): + prefix = "some/prefix" + url = "oss://{}/{}".format(self.bucket_name, prefix) + config = { + "url": url, + "oss_key_id": self.key_id, + "oss_key_secret": self.key_secret, + "oss_endpoint": self.endpoint, + } + remote = RemoteOSS(None, config) + self.assertEqual(remote.url, url) + self.assertEqual(remote.prefix, prefix) + self.assertEqual(remote.bucket, self.bucket_name) + self.assertEqual(remote.endpoint, self.endpoint) + self.assertEqual(remote.key_id, self.key_id) + self.assertEqual(remote.key_secret, self.key_secret)