Skip to content

Commit

Permalink
Import Tale from a zip serialization
Browse files Browse the repository at this point in the history
  • Loading branch information
Xarthisius committed Mar 23, 2019
1 parent 4863ad1 commit 0a5b4e8
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 26 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -9,3 +9,4 @@ dataone.common==3.2.0
dataone.libclient==3.2.0
dataone.cli==3.2.0
validators
fs.webdavfs
100 changes: 75 additions & 25 deletions server/rest/tale.py
Expand Up @@ -5,6 +5,7 @@
from datetime import datetime, timedelta
import time
import tempfile
import zipfile

from girder.api import access
from girder.api.rest import iterBody
Expand All @@ -14,13 +15,15 @@
setResponseHeader, setContentDisposition

from girder.constants import AccessType, SortDir, TokenScope
from girder.utility import ziputil
from girder.utility import ziputil, assetstore_utilities
from girder.utility.path import getResourcePath
from girder.utility.progress import ProgressContext
from girder.models.assetstore import Assetstore
from girder.models.token import Token
from girder.models.folder import Folder
from girder.models.user import User
from girder.plugins.jobs.constants import REST_CREATE_JOB_TOKEN_SCOPE
from girder.plugins.jobs.models.job import Job
from gwvolman.tasks import import_tale, build_tale_image

from girder.plugins.jobs.constants import JobStatus
Expand Down Expand Up @@ -179,8 +182,8 @@ def deleteTale(self, tale, progress):
'of the query string. The file will be stored in a temporary '
'space. However, it is not currently being processed in any '
'way.')
.param('imageId', "The ID of the tale's image.", required=True)
.param('url', 'External dataset identifier.', required=True)
.param('imageId', "The ID of the tale's image.", required=False)
.param('url', 'External dataset identifier.', required=False)
.param('spawn', 'If false, create only Tale object without a corresponding '
'Instance.',
default=True, required=False, dataType='boolean')
Expand All @@ -193,35 +196,82 @@ def deleteTale(self, tale, progress):
)
def createTaleFromDataset(self, imageId, url, spawn, lookupKwargs, taleKwargs):
# NOTE(review): this span is a unified-diff rendering with the +/- markers
# and indentation lost in the paste; pre-change and post-change lines are
# interleaved. Comments below flag what appears to be removed (old) code —
# confirm against the repository before treating this as runnable source.
user = self.getCurrentUser()
# NOTE(review): next two lines look like the removed (pre-change)
# unconditional image load; the new code defers loading the image to the
# non-zip branch below, where imageId is actually required.
image = imageModel().load(imageId, user=user, level=AccessType.READ,
exc=True)
token = self.getCurrentToken()
# Widen the current token so the spawned job can create further jobs.
Token().addScope(token, scope=REST_CREATE_JOB_TOKEN_SCOPE)

# A non-empty request body signals a zip upload; otherwise fall through
# to the imageId/url registration path.
if cherrypy.request.body.length > 0:
# NOTE(review): removed (pre-change) tempfile line without dir=.
with tempfile.NamedTemporaryFile() as fp:
# TODO: Move assetstore type to wholetale.
assetstore = next((_ for _ in Assetstore().list() if _['type'] == 101), None)
if not assetstore:
raise RestException("No workspace assetstore...")
adapter = assetstore_utilities.getAssetstoreAdapter(assetstore)

# Spool the uploaded body into the workspace assetstore's temp dir.
with tempfile.NamedTemporaryFile(dir=adapter.tempDir) as fp:
for chunk in iterBody(2 * 1024 ** 3):
fp.write(chunk)
fp.seek(0)
# TODO: check if zip, check if Tale
# short-circuit here either by running a separate job
# or modifying import_tale to grab a file

# NOTE(review): the block from here down to "return taleTask.job" looks
# like removed (pre-change) code — duplicated verbatim in the else
# branch of the new version below.
try:
lookupKwargs['dataId'] = [url]
except TypeError:
lookupKwargs = dict(dataId=[url])

try:
taleKwargs['imageId'] = str(image['_id'])
except TypeError:
taleKwargs = dict(imageId=str(image['_id']))

taleTask = import_tale.delay(
lookupKwargs, taleKwargs, spawn=spawn,
girder_client_token=str(token['_id'])
)
return taleTask.job
if not zipfile.is_zipfile(fp):
raise RestException("Provided file is not a zipfile")

z = zipfile.ZipFile(fp)
# Locate the Tale's manifest anywhere inside the archive.
manifest_file = next(
(_ for _ in z.namelist() if _.endswith('manifest.json')),
None
)
if not manifest_file:
raise RestException("Provided file doesn't contain a Tale")

try:
manifest = json.loads(z.read(manifest_file).decode())
# TODO: is there a better check?
manifest['@id'].startswith('https://data.wholetale.org')
except Exception as e:
raise RestException(
"Couldn't read manifest.json or not a Tale: {}".format(str(e))
)

# Extract files to tmp on workspace assetstore
temp_dir = tempfile.mkdtemp(dir=adapter.tempDir)
# In theory malicious content like: abs path for a member, or relative path with
# ../.. etc., is taken care of by zipfile.extractall, but in the end we're still
# unzipping an untrusted content. What could possibly go wrong...?
z.extractall(path=temp_dir)

# Hand the unpacked Tale off to the local import job (see
# server/tasks/import_tale.py in this changeset).
# NOTE(review): `async=False` is a keyword argument named with a word
# that becomes reserved in Python 3.7 (hence the W606 ignore added to
# setup.cfg in this commit) — verify the target Python version.
job = Job().createLocalJob(
title='Import Tale from zip', user=user,
type='wholetale.import_tale', public=False, async=False,
module='girder.plugins.wholetale.tasks.import_tale',
args=(temp_dir, manifest_file),
kwargs={'user': user, 'token': token}
)
Job().scheduleJob(job)
return job
else:
# No body uploaded: the classic registration path needs both params.
if not (imageId or url):
msg = (
"You need to provide either a zipfile with an exported Tale or "
" both 'imageId' and 'url' parameters."
)
raise RestException(msg)

image = imageModel().load(imageId, user=user, level=AccessType.READ,
exc=True)

# EAFP: lookupKwargs/taleKwargs may arrive as None (TypeError on
# item assignment), in which case fresh dicts are built.
try:
lookupKwargs['dataId'] = [url]
except TypeError:
lookupKwargs = dict(dataId=[url])

try:
taleKwargs['imageId'] = str(image['_id'])
except TypeError:
taleKwargs = dict(imageId=str(image['_id']))

# Delegate the actual import to the celery task; return its job doc.
taleTask = import_tale.delay(
lookupKwargs, taleKwargs, spawn=spawn,
girder_client_token=str(token['_id'])
)
return taleTask.job

@access.user
@autoDescribeRoute(
Expand Down
Empty file added server/tasks/__init__.py
Empty file.
137 changes: 137 additions & 0 deletions server/tasks/import_tale.py
@@ -0,0 +1,137 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import pathlib
import sys
import traceback
from webdavfs.webdavfs import WebDAVFS
from fs.osfs import OSFS
from fs.copy import copy_fs
from girder.models.file import File
from girder.utility import config
from girder.plugins.jobs.constants import JobStatus
from girder.plugins.jobs.models.job import Job

from ..constants import CATALOG_NAME
from ..lib import pids_to_entities, register_dataMap
from ..lib.license import WholeTaleLicense
from ..lib.dataone import DataONELocations # TODO: get rid of it
from ..models.image import Image
from ..models.tale import Tale
from ..utils import getOrCreateRootFolder


def run(job):
    """Import a Tale from an extracted zip serialization.

    Job payload:
        args: ``(tale_dir, manifest_file)`` -- the directory the zip was
            unpacked into, and the path of the Tale's ``manifest.json``
            relative to it.
        kwargs: ``{'user': <user doc>, 'token': <girder token doc>}`` --
            the token authenticates the WebDAV copy step.

    Pipeline: register external datasets, build the Tale's dataSet, create
    the Tale object, then copy workspace files over WebDAV.  On failure the
    job is marked ERROR with a traceback and the exception is re-raised.
    """
    jobModel = Job()
    jobModel.updateJob(job, status=JobStatus.RUNNING)

    tale_dir, manifest_file = job["args"]
    user = job["kwargs"]["user"]
    token = job["kwargs"]["token"]

    try:
        os.chdir(tale_dir)
        # Use context managers so descriptors are closed even on error
        # (the original used bare json.load(open(...)) and leaked them).
        with open(manifest_file, 'r') as fp:
            manifest = json.load(fp)

        # Load the environment description (sits next to manifest.json).
        env_file = os.path.join(os.path.dirname(manifest_file), "environment.json")
        with open(env_file, 'r') as fp:
            environment = json.load(fp)

        # 1. Register external data referenced by the manifest.
        dataIds = [_['identifier'] for _ in manifest["Datasets"]]
        if dataIds:
            jobModel.updateJob(job, status=JobStatus.RUNNING, log="Registering external data")
            dataMap = pids_to_entities(
                dataIds, user=user, base_url=DataONELocations.prod_cn, lookup=True
            )  # DataONE shouldn't be here
            register_dataMap(
                dataMap,
                getOrCreateRootFolder(CATALOG_NAME),
                'folder',
                user=user,
                base_url=DataONELocations.prod_cn,
            )

        # 2. Construct the dataSet from aggregates that carry a bundledAs.
        dataSet = []
        for obj in manifest['aggregates']:
            if 'bundledAs' not in obj:
                continue
            uri = obj['uri']
            fobj = File().findOne({'linkUrl': uri})  # TODO: That's expensive, use something else
            if fobj:
                dataSet.append({
                    'itemId': fobj['itemId'],
                    '_modelType': 'item',
                    'mountPath': obj['bundledAs']['filename']
                })
            # TODO: handle folders

        # 3. Create a Tale
        jobModel.updateJob(job, status=JobStatus.RUNNING, log="Creating a Tale object")
        image = Image().findOne(
            {"name": environment["name"]}
        )  # TODO: create if necessary, for now assume we have it.
        image = Image().filter(image, user)
        icon = image.get(
            "icon",
            (
                "https://raw.githubusercontent.com/"
                "whole-tale/dashboard/master/public/"
                "images/whole_tale_logo.png"
            ),
        )
        # First aggregate with a license wins; default otherwise.
        licenseSPDX = next(
            (
                _["schema:license"]
                for _ in manifest["aggregates"]
                if "schema:license" in _
            ),
            WholeTaleLicense.default_spdx(),
        )
        authors = " ".join((user["firstName"], user["lastName"]))

        tale = Tale().createTale(
            image,
            dataSet,
            creator=user,
            save=True,
            title=manifest["schema:name"],
            description=manifest["schema:description"],
            public=False,
            config={},
            icon=icon,
            illustration=manifest["schema:image"],
            authors=authors,
            category=manifest["schema:category"],
            licenseSPDX=licenseSPDX,
        )

        # 4. Copy data to the workspace using WebDAVFS (if it exists)
        jobModel.updateJob(
            job, status=JobStatus.RUNNING, log="Copying files to workspace"
        )
        orig_tale_id = pathlib.Path(manifest_file).parts[0]
        # BUG FIX: the original iterated over ("workspace", "data/workspace",
        # None) and ended up calling os.path.isdir(None) when neither
        # candidate existed, raising TypeError.  Search explicitly and leave
        # workdir as None when no workspace directory is present.
        workdir = None
        for candidate in ("workspace", "data/workspace"):
            candidate_path = os.path.join(orig_tale_id, candidate)
            if os.path.isdir(candidate_path):
                workdir = candidate_path
                break

        if workdir:
            password = "token:{_id}".format(**token)
            root = "/tales/{_id}".format(**tale)
            url = "http://localhost:{}".format(config.getConfig()['server.socket_port'])
            with WebDAVFS(
                url, login=user["login"], password=password, root=root
            ) as webdav_handle:
                copy_fs(OSFS(workdir), webdav_handle)

        jobModel.updateJob(job, status=JobStatus.SUCCESS, log="Tale created")
    except Exception:
        # Boundary handler: record the traceback in the job log so the
        # failure is visible in the UI, then re-raise for the caller.
        t, val, tb = sys.exc_info()
        log = "%s: %s\n%s" % (t.__name__, repr(val), traceback.extract_tb(tb))
        jobModel.updateJob(job, status=JobStatus.ERROR, log=log)
        raise
3 changes: 2 additions & 1 deletion setup.cfg
Expand Up @@ -54,4 +54,5 @@ exclude: girder/external/*
# N803 - Argument name should be lowercase.
# N806 - Variable in function should be lowercase.
# N812 - Lowercase imported as non lowercase.
ignore: D100,D101,D102,D103,D104,D105,D107,D200,D201,D202,D203,D204,D205,D400,D401,D402,E123,E226,E241,N802,N803,N806,N812
# W606 - 'async' and 'await' are reserved keywords starting with Python 3.7
ignore: D100,D101,D102,D103,D104,D105,D107,D200,D201,D202,D203,D204,D205,D400,D401,D402,E123,E226,E241,N802,N803,N806,N812,W606

0 comments on commit 0a5b4e8

Please sign in to comment.