Skip to content

Commit

Permalink
Merge pull request #259 from whole-tale/import_tale_as_a_file
Browse files Browse the repository at this point in the history
Accept serialized Tale in POST /tale/import
  • Loading branch information
Xarthisius committed Jun 18, 2019
2 parents 6b43237 + 070ae58 commit 0e9db92
Show file tree
Hide file tree
Showing 8 changed files with 710 additions and 20 deletions.
7 changes: 6 additions & 1 deletion plugin.cmake
Expand Up @@ -7,7 +7,12 @@ add_python_test(harvester
plugins/wholetale/test_list_files.txt
)
add_python_test(image PLUGIN wholetale)
add_python_test(tale PLUGIN wholetale)
# The tale tests need two fixtures fetched via CMake's ExternalData
# mechanism: a sample exported Tale (zip) and the recorded HTTP
# cassette replayed by the import test.
add_python_test(tale
PLUGIN wholetale
EXTERNAL_DATA
plugins/wholetale/5c92fbd472a9910001fbff72.zip
plugins/wholetale/tale_import_zip.txt
)
add_python_test(instance PLUGIN wholetale)
add_python_test(constants PLUGIN wholetale)
add_python_test(utils PLUGIN wholetale)
Expand Down
Binary file added plugin_tests/data/5c92fbd472a9910001fbff72.zip
Binary file not shown.
428 changes: 428 additions & 0 deletions plugin_tests/data/tale_import_zip.txt

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions plugin_tests/tale_test.py
Expand Up @@ -10,6 +10,7 @@
import time
import urllib.request
import tempfile
import vcr
import zipfile
import shutil
from tests import base
Expand Down Expand Up @@ -537,6 +538,38 @@ def testTaleImport(self, it):
self.assertEqual(job_call['kwargs'], {'spawn': False})
self.assertEqual(job_call['headers']['girder_job_title'], 'Import Tale')

@vcr.use_cassette(os.path.join(DATA_PATH, 'tale_import_zip.txt'))
def testTaleImportZip(self):
    """POST a serialized Tale zip to /tale/import and verify the local
    import job runs to completion and creates the expected Tale.

    External HTTP traffic is replayed from the vcr cassette; the final
    workspace copy is mocked out and only asserted to have happened.
    """
    image_model = self.model('image', 'wholetale')
    image = image_model.createImage(
        name="Jupyter Classic", creator=self.user, public=True,
        config=dict(template='base.tpl', buildpack='PythonBuildPack',
                    user='someUser', port=8888, urlPath=''))
    zip_path = os.path.join(DATA_PATH, '5c92fbd472a9910001fbff72.zip')
    with mock.patch('fs.copy.copy_fs') as mock_copy:
        with open(zip_path, 'rb') as zip_fp:
            resp = self.request(
                path='/tale/import', method='POST', user=self.user,
                type='application/zip',
                body=zip_fp.read(),
            )

        self.assertStatusOk(resp)
        job = resp.json

        from girder.plugins.jobs.models.job import Job
        self.assertEqual(job['type'], 'wholetale.import_tale')
        # Poll the job document until it reaches a terminal state
        # (at most 300 * 0.1 s = 30 s).
        polls = 0
        while polls < 300 and \
                job['status'] not in (JobStatus.SUCCESS, JobStatus.ERROR):
            time.sleep(0.1)
            job = Job().load(job['_id'], force=True)
            polls += 1
        self.assertEqual(job['status'], JobStatus.SUCCESS)
        mock_copy.assert_called_once()
    # TODO: make it more extensive...
    water_tale = self.model('tale', 'wholetale').findOne({'title': 'Water Tale'})
    self.assertTrue(water_tale is not None)
    image_model.remove(image)

def testTaleUpdate(self):
from server.lib.license import WholeTaleLicense
# Test that Tale updating works
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -10,3 +10,4 @@ dataone.libclient==3.2.0
dataone.cli==3.2.0
validators
html2markdown
fs.webdavfs
106 changes: 88 additions & 18 deletions server/rest/tale.py
@@ -1,16 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import cherrypy
import json
import tempfile
import time
import zipfile

from girder import events
from girder.api import access
from girder.api.rest import iterBody
from girder.api.docs import addModel
from girder.api.describe import Description, autoDescribeRoute
from girder.api.rest import Resource, filtermodel, RestException,\
setResponseHeader, setContentDisposition

from girder.constants import AccessType, SortDir, TokenScope
from girder.utility import assetstore_utilities
from girder.utility.path import getResourcePath
from girder.utility.progress import ProgressContext
from girder.models.assetstore import Assetstore
from girder.models.token import Token
from girder.models.folder import Folder
from girder.plugins.jobs.models.job import Job
Expand Down Expand Up @@ -179,9 +187,13 @@ def deleteTale(self, tale, progress):
@autoDescribeRoute(
Description('Create a new tale from an external dataset.')
.notes('Currently, this task only handles importing raw data. '
'In the future, it should also allow importing serialized Tales.')
.param('imageId', "The ID of the tale's image.", required=True)
.param('url', 'External dataset identifier.', required=True)
'A serialized Tale can be sent as the body of the request using an '
'appropriate content-type and with the other parameters as part '
'of the query string. The file will be stored in a temporary '
'space. However, it is not currently being processed in any '
'way.')
.param('imageId', "The ID of the tale's image.", required=False)
.param('url', 'External dataset identifier.', required=False)
.param('spawn', 'If false, create only Tale object without a corresponding '
'Instance.',
default=True, required=False, dataType='boolean')
Expand All @@ -194,29 +206,87 @@ def deleteTale(self, tale, progress):
)
def createTaleFromDataset(self, imageId, url, spawn, lookupKwargs, taleKwargs):
user = self.getCurrentUser()
image = imageModel().load(imageId, user=user, level=AccessType.READ,
exc=True)
token = Token().createToken(
user=user,
days=0.5,
scope=(TokenScope.USER_AUTH, REST_CREATE_JOB_TOKEN_SCOPE)
)

try:
lookupKwargs['dataId'] = [url]
except TypeError:
lookupKwargs = dict(dataId=[url])
if cherrypy.request.headers.get('Content-Type') == 'application/zip':
# TODO: Move assetstore type to wholetale.
assetstore = next((_ for _ in Assetstore().list() if _['type'] == 101), None)
if assetstore:
adapter = assetstore_utilities.getAssetstoreAdapter(assetstore)
tempDir = adapter.tempDir
else:
tempDir = None

with tempfile.NamedTemporaryFile(dir=tempDir) as fp:
for chunk in iterBody(2 * 1024 ** 3):
fp.write(chunk)
fp.seek(0)
if not zipfile.is_zipfile(fp):
raise RestException("Provided file is not a zipfile")

with zipfile.ZipFile(fp) as z:
manifest_file = next(
(_ for _ in z.namelist() if _.endswith('manifest.json')),
None
)
if not manifest_file:
raise RestException("Provided file doesn't contain a Tale")

try:
manifest = json.loads(z.read(manifest_file).decode())
# TODO: is there a better check?
manifest['@id'].startswith('https://data.wholetale.org')
except Exception as e:
raise RestException(
"Couldn't read manifest.json or not a Tale: {}".format(str(e))
)

# Extract files to tmp on workspace assetstore
temp_dir = tempfile.mkdtemp(dir=tempDir)
# In theory malicious content like: abs path for a member, or relative path with
# ../.. etc., is taken care of by zipfile.extractall, but in the end we're still
# unzipping an untrusted content. What could possibly go wrong...?
z.extractall(path=temp_dir)

job = Job().createLocalJob(
title='Import Tale from zip', user=user,
type='wholetale.import_tale', public=False, async=True,
module='girder.plugins.wholetale.tasks.import_tale',
args=(temp_dir, manifest_file),
kwargs={'user': user, 'token': token}
)
Job().scheduleJob(job)
return job
else:
if not (imageId or url):
msg = (
"You need to provide either a zipfile with an exported Tale or "
" both 'imageId' and 'url' parameters."
)
raise RestException(msg)

try:
taleKwargs['imageId'] = str(image['_id'])
except TypeError:
taleKwargs = dict(imageId=str(image['_id']))
image = imageModel().load(imageId, user=user, level=AccessType.READ,
exc=True)

taleTask = import_tale.delay(
lookupKwargs, taleKwargs, spawn=spawn,
girder_client_token=str(token['_id'])
)
return taleTask.job
try:
lookupKwargs['dataId'] = [url]
except TypeError:
lookupKwargs = dict(dataId=[url])

try:
taleKwargs['imageId'] = str(image['_id'])
except TypeError:
taleKwargs = dict(imageId=str(image['_id']))

taleTask = import_tale.delay(
lookupKwargs, taleKwargs, spawn=spawn,
girder_client_token=str(token['_id'])
)
return taleTask.job

@access.user
@autoDescribeRoute(
Expand Down
152 changes: 152 additions & 0 deletions server/tasks/import_tale.py
@@ -0,0 +1,152 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import pathlib
import sys
import traceback
from webdavfs.webdavfs import WebDAVFS
from fs.osfs import OSFS
from fs.copy import copy_fs
from girder.models.file import File
from girder.utility import config
from girder.plugins.jobs.constants import JobStatus
from girder.plugins.jobs.models.job import Job

from ..constants import CATALOG_NAME
from ..lib import pids_to_entities, register_dataMap
from ..lib.license import WholeTaleLicense
from ..lib.dataone import DataONELocations # TODO: get rid of it
from ..models.image import Image
from ..models.tale import Tale
from ..utils import getOrCreateRootFolder


def run(job):
    """Import a serialized Tale that has been unpacked into a directory.

    Expects ``job["args"] == (tale_dir, manifest_file)``, where
    ``manifest_file`` is a path (relative to ``tale_dir``) to the Tale's
    ``manifest.json``, and ``job["kwargs"]`` carries the acting ``user``
    document and an auth ``token``.  Steps: register external data,
    build the dataSet from the manifest, create the Tale object, then
    copy any exported workspace files in over WebDAV.  Progress is
    written to the job log; on failure the job is marked ERROR with a
    traceback and the exception is re-raised.
    """
    jobModel = Job()
    jobModel.updateJob(job, status=JobStatus.RUNNING)

    tale_dir, manifest_file = job["args"]
    user = job["kwargs"]["user"]
    token = job["kwargs"]["token"]

    try:
        # All paths below (manifest_file, workspace dirs) are relative to
        # the root of the unpacked export.
        os.chdir(tale_dir)
        with open(manifest_file, 'r') as manifest_fp:
            manifest = json.load(manifest_fp)

        # Load the environment description
        env_file = os.path.join(os.path.dirname(manifest_file), "environment.json")
        with open(env_file, 'r') as env_fp:
            environment = json.load(env_fp)

        # 1. Register data
        dataIds = [_['identifier'] for _ in manifest["Datasets"]]
        if dataIds:
            jobModel.updateJob(
                job, status=JobStatus.RUNNING, log="Registering external data"
            )
            dataMap = pids_to_entities(
                dataIds, user=user, base_url=DataONELocations.prod_cn, lookup=True
            )  # DataONE shouldn't be here
            register_dataMap(
                dataMap,
                getOrCreateRootFolder(CATALOG_NAME),
                'folder',
                user=user,
                base_url=DataONELocations.prod_cn,
            )

        # 2. Construct the dataSet from aggregates that carry a bundledAs
        # entry (i.e. external data mounted into the Tale).
        dataSet = []
        for obj in manifest['aggregates']:
            if 'bundledAs' not in obj:
                continue
            uri = obj['uri']
            fobj = File().findOne(
                {'linkUrl': uri}
            )  # TODO: That's expensive, use something else
            if fobj:
                dataSet.append(
                    {
                        'itemId': fobj['itemId'],
                        '_modelType': 'item',
                        'mountPath': obj['bundledAs']['filename'],
                    }
                )
            # TODO: handle folders

        # 3. Create a Tale
        jobModel.updateJob(job, status=JobStatus.RUNNING, log="Creating a Tale object")
        image = Image().findOne(
            {"name": environment["name"]}
        )  # TODO: create if necessary, for now assume we have it.
        image = Image().filter(image, user)
        icon = image.get(
            "icon",
            (
                "https://raw.githubusercontent.com/"
                "whole-tale/dashboard/master/public/"
                "images/whole_tale_logo.png"
            ),
        )
        # First explicit license found among the aggregates, else default.
        licenseSPDX = next(
            (
                _["schema:license"]
                for _ in manifest["aggregates"]
                if "schema:license" in _
            ),
            WholeTaleLicense.default_spdx(),
        )
        authors = [
            {
                "firstName": author["schema:givenName"],
                "lastName": author["schema:familyName"],
                "orcid": author["@id"],
            }
            for author in manifest["schema:author"]
        ]

        tale = Tale().createTale(
            image,
            dataSet,
            creator=user,
            save=True,
            title=manifest["schema:name"],
            description=manifest["schema:description"],
            public=False,
            config={},
            icon=icon,
            illustration=manifest["schema:image"],
            authors=authors,
            category=manifest["schema:category"],
            licenseSPDX=licenseSPDX,
        )

        # 4. Copy data to the workspace using WebDAVFS (if it exists)
        jobModel.updateJob(
            job, status=JobStatus.RUNNING, log="Copying files to workspace"
        )
        orig_tale_id = pathlib.Path(manifest_file).parts[0]
        # Locate the exported workspace; older exports used "workspace",
        # newer ones "data/workspace".  workdir stays None when neither
        # exists, in which case the copy step is skipped.  (The previous
        # loop passed None to os.path.isdir, raising TypeError when no
        # workspace directory was present.)
        workdir = None
        for subdir in ("workspace", "data/workspace"):
            candidate = os.path.join(orig_tale_id, subdir)
            if os.path.isdir(candidate):
                workdir = candidate
                break

        if workdir:
            password = "token:{_id}".format(**token)
            root = "/tales/{_id}".format(**tale)
            url = "http://localhost:{}".format(config.getConfig()['server.socket_port'])
            with WebDAVFS(
                url, login=user["login"], password=password, root=root
            ) as webdav_handle:
                copy_fs(OSFS(workdir), webdav_handle)

        jobModel.updateJob(job, status=JobStatus.SUCCESS, log="Tale created")
    except Exception:
        # Record the full traceback in the job log before propagating.
        t, val, tb = sys.exc_info()
        log = "%s: %s\n%s" % (t.__name__, repr(val), traceback.extract_tb(tb))
        jobModel.updateJob(job, status=JobStatus.ERROR, log=log)
        raise
3 changes: 2 additions & 1 deletion setup.cfg
Expand Up @@ -54,4 +54,5 @@ exclude: girder/external/*
# N803 - Argument name should be lowercase.
# N806 - Variable in function should be lowercase.
# N812 - Lowercase imported as non lowercase.
# W606 - 'async' and 'await' are reserved keywords starting with Python 3.7
ignore: D100,D101,D102,D103,D104,D105,D107,D200,D201,D202,D203,D204,D205,D400,D401,D402,E123,E226,E241,N802,N803,N806,N812,W606

0 comments on commit 0e9db92

Please sign in to comment.