Commit

Import Tale from a zip serialization

Xarthisius committed Mar 27, 2019
1 parent 7815aad commit 9eacf44
Showing 9 changed files with 694 additions and 35 deletions.
7 changes: 6 additions & 1 deletion plugin.cmake
@@ -7,7 +7,12 @@ add_python_test(harvester
    plugins/wholetale/test_list_files.txt
)
add_python_test(image PLUGIN wholetale)
-add_python_test(tale PLUGIN wholetale)
+add_python_test(tale
+    PLUGIN wholetale
+    EXTERNAL_DATA
+    plugins/wholetale/5c92fbd472a9910001fbff72.zip
+    plugins/wholetale/tale_import_zip.txt
+)
add_python_test(instance PLUGIN wholetale)
add_python_test(constants PLUGIN wholetale)
add_python_test(utils PLUGIN wholetale)
Binary file added plugin_tests/data/5c92fbd472a9910001fbff72.zip
Binary file not shown.
428 changes: 428 additions & 0 deletions plugin_tests/data/tale_import_zip.txt

Large diffs are not rendered by default.

47 changes: 39 additions & 8 deletions plugin_tests/tale_test.py
@@ -7,9 +7,9 @@
import time
import urllib.request
import tempfile
+import vcr
import zipfile
import shutil
from bson import ObjectId
from tests import base
from .tests_helpers import mockOtherRequest
from girder.models.item import Item
@@ -172,10 +172,9 @@ def testTaleFlow(self):
            'licenseSPDX': taleLicense,
            'publishInfo': [
                {
-
-                        'pid': 'published_pid',
-                        'uri': 'published_url',
-                        'date': '2019-01-23T15:48:17.476000+00:00',
+                    'pid': 'published_pid',
+                    'uri': 'published_url',
+                    'date': '2019-01-23T15:48:17.476000+00:00',
                }
            ]
        })
@@ -560,6 +559,38 @@ def testTaleImport(self, it):
self.assertEqual(job_call['kwargs'], {'spawn': False})
self.assertEqual(job_call['headers']['girder_job_title'], 'Import Tale')

+    @vcr.use_cassette(os.path.join(DATA_PATH, 'tale_import_zip.txt'))
+    def testTaleImportZip(self):
+        image = self.model('image', 'wholetale').createImage(
+            name="Jupyter Classic", creator=self.user, public=True,
+            config=dict(template='base.tpl', buildpack='PythonBuildPack',
+                        user='someUser', port=8888, urlPath=''))
+        with mock.patch('fs.copy.copy_fs') as mock_copy:
+            with open(os.path.join(DATA_PATH, '5c92fbd472a9910001fbff72.zip'), 'rb') as fp:
+                resp = self.request(
+                    path='/tale/import', method='POST', user=self.user,
+                    type='application/zip',
+                    body=fp.read(),
+                )
+
+            self.assertStatusOk(resp)
+            job = resp.json
+
+            from girder.plugins.jobs.models.job import Job
+            self.assertEqual(job['type'], 'wholetale.import_tale')
+            for i in range(300):
+                if job['status'] in {JobStatus.SUCCESS, JobStatus.ERROR}:
+                    break
+                time.sleep(0.1)
+                job = Job().load(job['_id'], force=True)
+            self.assertEqual(job['status'], JobStatus.SUCCESS)
+            mock_copy.assert_called_once()
+        # TODO: make it more extensive...
+        self.assertTrue(
+            self.model('tale', 'wholetale').findOne({'title': 'Water Tale'}) is not None
+        )
+        self.model('image', 'wholetale').remove(image)

def testTaleUpdate(self):
from server.lib.license import WholeTaleLicense
# Test that Tale updating works
@@ -597,9 +628,9 @@ def testTaleUpdate(self):
            'licenseSPDX': taleLicense,
            'publishInfo': [
                {
-                        'pid': 'published_pid',
-                        'uri': 'published_url',
-                        'date': '2019-01-23T15:48:17.476000+00:00',
+                    'pid': 'published_pid',
+                    'uri': 'published_url',
+                    'date': '2019-01-23T15:48:17.476000+00:00',
                }
            ]
        })
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ dataone.common==3.2.0
dataone.libclient==3.2.0
dataone.cli==3.2.0
validators
+fs.webdavfs
104 changes: 79 additions & 25 deletions server/rest/tale.py
@@ -2,8 +2,10 @@
# -*- coding: utf-8 -*-
import cherrypy
from datetime import datetime, timedelta
+import json
import time
import tempfile
+import zipfile

from girder.api import access
from girder.api.rest import iterBody
@@ -13,12 +15,15 @@
    setResponseHeader, setContentDisposition

from girder.constants import AccessType, SortDir, TokenScope
+from girder.utility import assetstore_utilities
from girder.utility.path import getResourcePath
from girder.utility.progress import ProgressContext
+from girder.models.assetstore import Assetstore
from girder.models.token import Token
from girder.models.folder import Folder
from girder.models.user import User
from girder.plugins.jobs.constants import REST_CREATE_JOB_TOKEN_SCOPE
+from girder.plugins.jobs.models.job import Job
from gwvolman.tasks import import_tale, build_tale_image

from girder.plugins.jobs.constants import JobStatus
@@ -178,8 +183,8 @@ def deleteTale(self, tale, progress):
           'of the query string. The file will be stored in a temporary '
           'space. However, it is not currently being processed in any '
           'way.')
-    .param('imageId', "The ID of the tale's image.", required=True)
-    .param('url', 'External dataset identifier.', required=True)
+    .param('imageId', "The ID of the tale's image.", required=False)
+    .param('url', 'External dataset identifier.', required=False)
    .param('spawn', 'If false, create only Tale object without a corresponding '
           'Instance.',
           default=True, required=False, dataType='boolean')
@@ -192,35 +197,84 @@ def deleteTale(self, tale, progress):
    )
    def createTaleFromDataset(self, imageId, url, spawn, lookupKwargs, taleKwargs):
        user = self.getCurrentUser()
-        image = imageModel().load(imageId, user=user, level=AccessType.READ,
-                                  exc=True)
        token = self.getCurrentToken()
        Token().addScope(token, scope=REST_CREATE_JOB_TOKEN_SCOPE)

        if cherrypy.request.body.length > 0:
-            with tempfile.NamedTemporaryFile() as fp:
+            if cherrypy.request.headers.get('Content-Type') == 'application/zip':
+                # TODO: Move assetstore type to wholetale.
+                assetstore = next((_ for _ in Assetstore().list() if _['type'] == 101), None)
+                if assetstore:
+                    adapter = assetstore_utilities.getAssetstoreAdapter(assetstore)
+                    tempDir = adapter.tempDir
+                else:
+                    tempDir = None
+
+            with tempfile.NamedTemporaryFile(dir=tempDir) as fp:
                for chunk in iterBody(2 * 1024 ** 3):
                    fp.write(chunk)
                fp.seek(0)
-                # TODO: check if zip, check if Tale
-                # shortcircut here either by running a separate job
-                # or modifying import_tale to grab a file
-
-        try:
-            lookupKwargs['dataId'] = [url]
-        except TypeError:
-            lookupKwargs = dict(dataId=[url])
-
-        try:
-            taleKwargs['imageId'] = str(image['_id'])
-        except TypeError:
-            taleKwargs = dict(imageId=str(image['_id']))
-
-        taleTask = import_tale.delay(
-            lookupKwargs, taleKwargs, spawn=spawn,
-            girder_client_token=str(token['_id'])
-        )
-        return taleTask.job
+                if not zipfile.is_zipfile(fp):
+                    raise RestException("Provided file is not a zipfile")
+
+                with zipfile.ZipFile(fp) as z:
+                    manifest_file = next(
+                        (_ for _ in z.namelist() if _.endswith('manifest.json')),
+                        None
+                    )
+                    if not manifest_file:
+                        raise RestException("Provided file doesn't contain a Tale")
+
+                    try:
+                        manifest = json.loads(z.read(manifest_file).decode())
+                        # TODO: is there a better check?
+                        manifest['@id'].startswith('https://data.wholetale.org')
+                    except Exception as e:
+                        raise RestException(
+                            "Couldn't read manifest.json or not a Tale: {}".format(str(e))
+                        )
+
+                    # Extract files to tmp on workspace assetstore
+                    temp_dir = tempfile.mkdtemp(dir=tempDir)
+                    # In theory malicious content like: abs path for a member, or relative path with
+                    # ../.. etc., is taken care of by zipfile.extractall, but in the end we're still
+                    # unzipping an untrusted content. What could possibly go wrong...?
+                    z.extractall(path=temp_dir)
+
+            job = Job().createLocalJob(
+                title='Import Tale from zip', user=user,
+                type='wholetale.import_tale', public=False, async=True,
+                module='girder.plugins.wholetale.tasks.import_tale',
+                args=(temp_dir, manifest_file),
+                kwargs={'user': user, 'token': token}
+            )
+            Job().scheduleJob(job)
+            return job
+        else:
+            if not (imageId or url):
+                msg = (
+                    "You need to provide either a zipfile with an exported Tale or "
+                    " both 'imageId' and 'url' parameters."
+                )
+                raise RestException(msg)
+
+            image = imageModel().load(imageId, user=user, level=AccessType.READ,
+                                      exc=True)
+
+            try:
+                lookupKwargs['dataId'] = [url]
+            except TypeError:
+                lookupKwargs = dict(dataId=[url])
+
+            try:
+                taleKwargs['imageId'] = str(image['_id'])
+            except TypeError:
+                taleKwargs = dict(imageId=str(image['_id']))
+
+            taleTask = import_tale.delay(
+                lookupKwargs, taleKwargs, spawn=spawn,
+                girder_client_token=str(token['_id'])
+            )
+            return taleTask.job

@access.user
@autoDescribeRoute(
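Taken together, the new branch in createTaleFromDataset accepts the raw archive as the request body with Content-Type: application/zip, exactly as the new test exercises. A minimal client-side sketch follows; the server URL, token value, and filename are illustrative, not part of this commit:

# Sketch: POST an exported Tale zip to the new endpoint (values are illustrative).
import requests

api = 'https://girder.example.org/api/v1'   # hypothetical deployment URL
headers = {
    'Girder-Token': '<valid-user-token>',   # standard Girder auth header
    'Content-Type': 'application/zip',      # selects the zip-import branch above
}
with open('5c92fbd472a9910001fbff72.zip', 'rb') as fp:
    resp = requests.post(api + '/tale/import', headers=headers, data=fp.read())
resp.raise_for_status()
job = resp.json()  # a 'wholetale.import_tale' job, handled by server/tasks/import_tale.py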
Empty file added server/tasks/__init__.py
Empty file.
139 changes: 139 additions & 0 deletions server/tasks/import_tale.py
@@ -0,0 +1,139 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import pathlib
import sys
import traceback
from webdavfs.webdavfs import WebDAVFS
from fs.osfs import OSFS
from fs.copy import copy_fs
from girder.models.file import File
from girder.utility import config
from girder.plugins.jobs.constants import JobStatus
from girder.plugins.jobs.models.job import Job

from ..constants import CATALOG_NAME
from ..lib import pids_to_entities, register_dataMap
from ..lib.license import WholeTaleLicense
from ..lib.dataone import DataONELocations # TODO: get rid of it
from ..models.image import Image
from ..models.tale import Tale
from ..utils import getOrCreateRootFolder


def run(job):
jobModel = Job()
jobModel.updateJob(job, status=JobStatus.RUNNING)

tale_dir, manifest_file = job["args"]
user = job["kwargs"]["user"]
token = job["kwargs"]["token"]

try:
os.chdir(tale_dir)
with open(manifest_file, 'r') as manifest_fp:
manifest = json.load(manifest_fp)

# Load the environment description
env_file = os.path.join(os.path.dirname(manifest_file), "environment.json")
with open(env_file, 'r') as env_fp:
environment = json.load(env_fp)

# 1. Register data
dataIds = [_['identifier'] for _ in manifest["Datasets"]]
if dataIds:
jobModel.updateJob(job, status=JobStatus.RUNNING, log="Registering external data")
dataMap = pids_to_entities(
dataIds, user=user, base_url=DataONELocations.prod_cn, lookup=True
) # DataONE shouldn't be here
register_dataMap(
dataMap,
getOrCreateRootFolder(CATALOG_NAME),
'folder',
user=user,
base_url=DataONELocations.prod_cn,
)

# 2. Construct the dataSet
dataSet = []
for obj in manifest['aggregates']:
if 'bundledAs' not in obj:
continue
uri = obj['uri']
fobj = File().findOne({'linkUrl': uri}) # TODO: That's expensive, use something else
if fobj:
dataSet.append({
'itemId': fobj['itemId'],
'_modelType': 'item',
'mountPath': obj['bundledAs']['filename']
})
# TODO: handle folders

# 3. Create a Tale
jobModel.updateJob(job, status=JobStatus.RUNNING, log="Creating a Tale object")
image = Image().findOne(
{"name": environment["name"]}
) # TODO: create if necessary, for now assume we have it.
image = Image().filter(image, user)
icon = image.get(
"icon",
(
"https://raw.githubusercontent.com/"
"whole-tale/dashboard/master/public/"
"images/whole_tale_logo.png"
),
)
licenseSPDX = next(
(
_["schema:license"]
for _ in manifest["aggregates"]
if "schema:license" in _
),
WholeTaleLicense.default_spdx(),
)
authors = " ".join((user["firstName"], user["lastName"]))

tale = Tale().createTale(
image,
dataSet,
creator=user,
save=True,
title=manifest["schema:name"],
description=manifest["schema:description"],
public=False,
config={},
icon=icon,
illustration=manifest["schema:image"],
authors=authors,
category=manifest["schema:category"],
licenseSPDX=licenseSPDX,
)

# 4. Copy data to the workspace using WebDAVFS (if it exists)
jobModel.updateJob(
job, status=JobStatus.RUNNING, log="Copying files to workspace"
)
orig_tale_id = pathlib.Path(manifest_file).parts[0]
for workdir in ("workspace", "data/workspace", None):
if workdir:
workdir = os.path.join(orig_tale_id, workdir)
if os.path.isdir(workdir):
break

if workdir:
password = "token:{_id}".format(**token)
root = "/tales/{_id}".format(**tale)
url = "http://localhost:{}".format(config.getConfig()['server.socket_port'])
with WebDAVFS(
url, login=user["login"], password=password, root=root
) as webdav_handle:
copy_fs(OSFS(workdir), webdav_handle)

jobModel.updateJob(job, status=JobStatus.SUCCESS, log="Tale created")
except Exception:
t, val, tb = sys.exc_info()
log = "%s: %s\n%s" % (t.__name__, repr(val), traceback.extract_tb(tb))
jobModel.updateJob(job, status=JobStatus.ERROR, log=log)
raise
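As a reading aid, the task above consumes only a handful of keys from manifest.json (plus "name" from environment.json). A minimal sketch of that shape, with all values invented for illustration, might look like:

# Illustrative only: the keys mirror what run() reads above; the values are made up.
manifest = {
    "@id": "https://data.wholetale.org/api/v1/tale/5c92fbd472a9910001fbff72",
    "schema:name": "Water Tale",
    "schema:description": "A short description",
    "schema:image": "https://example.org/illustration.png",
    "schema:category": "Examples",
    "Datasets": [{"identifier": "doi:10.1234/FAKE"}],    # registered via pids_to_entities
    "aggregates": [
        {"uri": "https://example.org/file.csv",          # matched against File().linkUrl
         "bundledAs": {"filename": "file.csv"}},
        {"schema:license": "CC-BY-4.0"},                 # picked up as licenseSPDX
    ],
}
environment = {"name": "Jupyter Classic"}                # looked up as an existing Image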
3 changes: 2 additions & 1 deletion setup.cfg
@@ -54,4 +54,5 @@ exclude: girder/external/*
# N803 - Argument name should be lowercase.
# N806 - Variable in function should be lowercase.
# N812 - Lowercase imported as non lowercase.
-ignore: D100,D101,D102,D103,D104,D105,D107,D200,D201,D202,D203,D204,D205,D400,D401,D402,E123,E226,E241,N802,N803,N806,N812
+# W606 - 'async' and 'await' are reserved keywords starting with Python 3.7
+ignore: D100,D101,D102,D103,D104,D105,D107,D200,D201,D202,D203,D204,D205,D400,D401,D402,E123,E226,E241,N802,N803,N806,N812,W606
