Skip to content

Commit

Permalink
adds the following: failed scalability enhancement for webpy.py (code…
Browse files Browse the repository at this point in the history
… retained but unused), fix list_collections on default repository to provide valid atom feed, fix content type returned with service document

git-svn-id: http://sword-app.svn.sourceforge.net/svnroot/sword-app/sss/branches/sss-2@471 2bf6ea0f-123d-0410-b71a-f1a21eb24612
  • Loading branch information
richard-jones committed Mar 15, 2012
1 parent 76ea312 commit be4a5f3
Show file tree
Hide file tree
Showing 4 changed files with 255 additions and 4 deletions.
6 changes: 6 additions & 0 deletions sss/config.py
Expand Up @@ -51,6 +51,12 @@
"store_dir" : "./store/", "store_dir" : "./store/",
# If you are using Apache you should set the store directory in full # If you are using Apache you should set the store directory in full
# The directory where incoming content will be temporarily stored
"tmp_dir" : "./tmp/",
# The chunk size used to copy file streams into and out of the temp directory
"copy_chunk_size" : 8096,
# explicitly set the sword version, so if you're testing validation of # explicitly set the sword version, so if you're testing validation of
# service documents you can "break" it. # service documents you can "break" it.
"sword_version" : "2.0", "sword_version" : "2.0",
Expand Down
3 changes: 2 additions & 1 deletion sss/pylons_sword_controller.py
Expand Up @@ -401,7 +401,8 @@ def _GET_service_document(self, path=None):
# if we get here authentication was successful and we carry on (we don't care who authenticated) # if we get here authentication was successful and we carry on (we don't care who authenticated)
ss = SwordServer(config, auth) ss = SwordServer(config, auth)
sd = ss.service_document(path) sd = ss.service_document(path)
response.content_type = "text/xml" response.content_type = "application/atomsvc+xml"
#response.content_type = "text/xml"
ssslog.info("Returning " + response.status + " from request on " + inspect.stack()[0][3]) ssslog.info("Returning " + response.status + " from request on " + inspect.stack()[0][3])
return sd return sd


Expand Down
25 changes: 25 additions & 0 deletions sss/repository.py
Expand Up @@ -254,6 +254,19 @@ def list_collection(self, id):


# create an empty feed element for the collection # create an empty feed element for the collection
feed = etree.Element(self.ns.ATOM + "feed", nsmap=self.cmap) feed = etree.Element(self.ns.ATOM + "feed", nsmap=self.cmap)

title = etree.SubElement(feed, self.ns.ATOM + "title")
title.text = "Title: " + id
myid = etree.SubElement(feed, self.ns.ATOM + "id")
myid.text = self.um.col_uri(id)
atomlink = etree.SubElement(feed, self.ns.ATOM + "link")
atomlink.set('rel', 'self')
atomlink.set('href', self.um.col_uri(id))
updated = etree.SubElement(feed, self.ns.ATOM + "updated")
updated.text = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
author = etree.SubElement(feed, self.ns.ATOM + "author")
authorname = etree.SubElement(author, self.ns.ATOM + "name")
authorname.text = "Simple Sword Server"


# if the collection path does not exist, then return the empty feed # if the collection path does not exist, then return the empty feed
cpath = os.path.join(self.configuration.store_dir, str(id)) cpath = os.path.join(self.configuration.store_dir, str(id))
Expand All @@ -264,9 +277,21 @@ def list_collection(self, id):
parts = os.listdir(cpath) parts = os.listdir(cpath)
for part in parts: for part in parts:
entry = etree.SubElement(feed, self.ns.ATOM + "entry") entry = etree.SubElement(feed, self.ns.ATOM + "entry")
entrytitle = etree.SubElement(entry, self.ns.ATOM + "title")
entrytitle.text = "Title: " + part
entryid = etree.SubElement(entry, self.ns.ATOM + "id")
entryid.text = self.um.edit_uri(id, part)
entryupdated = etree.SubElement(entry, self.ns.ATOM + "updated")
entryupdated.text = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
summary = etree.SubElement(entry, self.ns.ATOM + "summary")
summary.text = "Summary for " + part
link = etree.SubElement(entry, self.ns.ATOM + "link") link = etree.SubElement(entry, self.ns.ATOM + "link")
link.set("rel", "edit") link.set("rel", "edit")
link.set("href", self.um.edit_uri(id, part)) link.set("href", self.um.edit_uri(id, part))
link2 = etree.SubElement(entry, self.ns.ATOM + "link")
link2.set("rel", "alternate")
link2.set("type", "text/html")
link2.set("href", self.um.edit_uri(id, part))


# pretty print and return # pretty print and return
return etree.tostring(feed, pretty_print=True) return etree.tostring(feed, pretty_print=True)
Expand Down
225 changes: 222 additions & 3 deletions sss/webpy.py
@@ -1,4 +1,4 @@
import web, re, base64, urllib, uuid import web, re, base64, urllib, uuid, os
from web.wsgiserver import CherryPyWSGIServer from web.wsgiserver import CherryPyWSGIServer
from core import Auth, SwordError, AuthException, DepositRequest, DeleteRequest from core import Auth, SwordError, AuthException, DepositRequest, DeleteRequest
from negotiator import ContentNegotiator, AcceptParameters, ContentType from negotiator import ContentNegotiator, AcceptParameters, ContentType
Expand Down Expand Up @@ -73,6 +73,32 @@
# Define a set of handlers for the various URLs defined above to be used by web.py # Define a set of handlers for the various URLs defined above to be used by web.py


class SwordHttpHandler(object): class SwordHttpHandler(object):

def read_to_tmp(self, web):
    """
    Stream the incoming request body into a uniquely named file under
    config.tmp_dir and return an open handle onto that file.

    The incoming body content is in wsgi.input, which is a file-like object
    but which only supports "read", not useful extras like "seek", so we
    copy it into a temp file and return a handle to that instead.

    web - the web.py module/object for the current request; this method
          reads web.ctx.env and uses web.utils.intget

    Returns a file object opened for binary reading and positioned at the
    start of the temp file, or None if there is no wsgi.input object.  The
    caller owns the returned handle and must close it (and remove the temp
    file) when it is finished with it.
    """
    # number of bytes the client declared it is sending (0 if absent/invalid)
    size = web.utils.intget(web.ctx.env.get('CONTENT_LENGTH'), 0)
    wsgi_input = web.ctx.env['wsgi.input']

    # makedirs rather than mkdir, so that a nested tmp_dir can be created
    if not os.path.exists(config.tmp_dir):
        os.makedirs(config.tmp_dir)

    fn = os.path.join(config.tmp_dir, str(uuid.uuid4()))
    ssslog.info("Reading incoming content of size " + str(size) + " to temp file " + fn)

    if wsgi_input is None:
        return None

    # the original code read from the "rfile" attribute of the input object;
    # fall back to the input object itself when that attribute is missing
    source = getattr(wsgi_input, "rfile", wsgi_input)

    remaining = size
    with open(fn, "wb") as outfile:
        while remaining > 0:
            chunk = source.read(min(config.copy_chunk_size, remaining))
            if not chunk:
                # the stream ended before CONTENT_LENGTH bytes arrived
                break
            outfile.write(chunk)
            # count what was actually read; a read may return fewer bytes
            # than were asked for
            remaining -= len(chunk)
        # push everything to disk once, rather than once per chunk
        outfile.flush()
        os.fsync(outfile.fileno())

    # Do NOT wrap this in a "with" block: returning from inside one closes the
    # file before the caller ever sees it.  Binary mode so that the payload is
    # handed back byte-for-byte.
    return open(fn, "rb")

def http_basic_authenticate(self, web): def http_basic_authenticate(self, web):
# extract the appropriate HTTP headers # extract the appropriate HTTP headers
auth_header = web.ctx.env.get('HTTP_AUTHORIZATION') auth_header = web.ctx.env.get('HTTP_AUTHORIZATION')
Expand Down Expand Up @@ -180,7 +206,118 @@ def validate_deposit_request(self, web, entry_section=None, binary_section=None,


except ValidationException as e: except ValidationException as e:
raise SwordError(error_uri=Errors.bad_request, msg=e.message) raise SwordError(error_uri=Errors.bad_request, msg=e.message)

''' FIXME: this was an experimental version which was supposed to scale
def validate_deposit_request(self, web, file_handle, entry_section=None, binary_section=None, multipart_section=None, empty_section=None, allow_multipart=True, allow_empty=False):
h = HttpHeaders()
# map the headers to standard http
mapped_headers = self._map_webpy_headers(web.ctx.environ)
ssslog.debug("Validating on header dictionary: " + str(mapped_headers))
# run the validation
try:
# there must be both an "atom" and "payload" input or data in web.data()
webin = web.input()
if len(webin) != 2 and len(webin) > 0:
raise ValidationException("Multipart request does not contain exactly 2 parts")
if len(webin) >= 2 and not webin.has_key("atom") and not webin.has_key("payload"):
raise ValidationException("Multipart request must contain Content-Dispositions with names 'atom' and 'payload'")
if len(webin) > 0 and not allow_multipart:
raise ValidationException("Multipart request not permitted in this context")
# if we get to here then we have a valid multipart or no multipart
is_multipart = False
is_empty = False
if len(webin) != 2:
if file_handle is not None:
file_handle.seek(0, 0)
byte = file_handle.read(1)
if byte == "" and allow_empty:
# the body is empty
ssslog.debug("first byte of deposit request is the empty string")
is_empty = True
else:
ssslog.debug("first byte of deposit request is \"" + byte + "\" ... not an empty request")
file_handle.seek(0, 0)
else:
is_empty = True
"""
# NOTE: the wsgi_input is a SizeCheckWrapper object which imperfectly
# wraps a file object. We therefore have to access the "rfile"
# property to interact with the file itself (although that is dangerous,
# as the file will be WSGI implementation specific)
wsgi_input = web.ctx.env['wsgi.input']
# if there is a wsgi input object with seek enabled, it may have already
# been read by webpy, so we seek back to the start
#if hasattr(wsgi_input, "seek"):
if wsgi_input is not None:
# in empty requests, the wsgi input object doesn't have a seek() method
# so we have to check for it
# wsgi_input.rfile.seek(0, 0)
# read just one byte out of the file, to see if there's any content
# if there is not, byte will be the empty string
byte = wsgi_input.rfile.read(1)
if byte == "" and allow_empty:
# the body is empty
ssslog.debug("first byte of deposit request is the empty string")
is_empty = True
else:
ssslog.debug("first byte of deposit request is \"" + byte + "\" ... not an empty request")
# wsgi_input.rfile.seek(0, 0)
elif wsgi_input is None:
is_empty = True
"""
# validate whether we allow an empty deposit
if is_empty and not allow_empty:
raise ValidationException("No content sent to the server")
elif is_empty and allow_empty:
ssslog.info("Validating an empty deposit (could be a control operation)")
else:
ssslog.info("Validating a multipart deposit")
is_multipart = True
"""
if wsgi_input is None or wsgi_input.read().strip() == "": # FIXME: this IS NOT safe to scale
if allow_empty:
ssslog.info("Validating an empty deposit (could be a control operation)")
is_empty = True
else:
raise ValidationException("No content sent to the server")
"""
"""
if len(webin) != 2: # if it is not multipart
if web.data() is None or web.data().strip() == "": # FIXME: this does not look safe to scale
if allow_empty:
ssslog.info("Validating an empty deposit (could be a control operation)")
is_empty = True
else:
raise ValidationException("No content sent to the server")
"""
is_entry = False
content_type = mapped_headers.get("CONTENT-TYPE")
if content_type is not None and content_type.startswith("application/atom+xml"):
ssslog.info("Validating an atom-only deposit")
is_entry = True
if not is_entry and not is_multipart and not is_empty:
ssslog.info("Validating a binary deposit")
section = entry_section if is_entry else multipart_section if is_multipart else empty_section if is_empty else binary_section
# now validate the http headers
h.validate(mapped_headers, section)
except ValidationException as e:
raise SwordError(error_uri=Errors.bad_request, msg=e.message)
'''

def get_deposit(self, web, auth=None, atom_only=False): def get_deposit(self, web, auth=None, atom_only=False):
# FIXME: this reads files into memory, and therefore does not scale # FIXME: this reads files into memory, and therefore does not scale
# FIXME: this does not deal with the Media Part headers on a multipart deposit # FIXME: this does not deal with the Media Part headers on a multipart deposit
Expand Down Expand Up @@ -245,6 +382,79 @@ def get_deposit(self, web, auth=None, atom_only=False):
# now just attach the authentication data and return # now just attach the authentication data and return
d.auth = auth d.auth = auth
return d return d

''' FIXME: this was an experimental version which was supposed to scale
def get_deposit(self, web, file_handle, auth=None, atom_only=False):
# FIXME: this reads files into memory, and therefore does not scale
# FIXME: this does not deal with the Media Part headers on a multipart deposit
"""
Take a web.py web object and extract from it the parameters and content required for a SWORD deposit. This
includes determining whether this is an Atom Multipart request or not, and extracting the atom/payload where
appropriate. It also includes extracting the HTTP headers which are relevant to deposit, and for those not
supplied providing their defaults in the returned DepositRequest object
"""
d = DepositRequest()
# map the webpy headers to something more standard
mapped_headers = self._map_webpy_headers(web.ctx.environ)
# get the headers that have been provided. Any headers which have not been provided will
# have default values applied
h = HttpHeaders()
d.set_from_headers(h.get_sword_headers(mapped_headers))
if d.content_type.startswith("application/atom+xml"):
atom_only=True
empty_request = False
if d.content_length == 0:
ssslog.info("Received empty deposit request")
empty_request = True
if d.content_length > config.max_upload_size:
raise SwordError(error_uri=Errors.max_upload_size_exceeded,
msg="Max upload size is " + str(config.max_upload_size) +
"; incoming content length was " + str(d.content_length))
# find out if this is a multipart or not
is_multipart = False
# FIXME: these headers aren't populated yet, because the webpy api doesn't
# appear to have a mechanism to retrieve them. urgh.
entry_part_headers = {}
media_part_headers = {}
webin = web.input()
if len(webin) == 2:
ssslog.info("Received multipart deposit request")
d.atom = webin['atom']
# FIXME: this reads the payload into memory, we need to sort that out
# read the zip file from the base64 encoded string
d.content = base64.decodestring(webin['payload'])
is_multipart = True
elif not empty_request:
# if this wasn't a multipart, and isn't an empty request, then the data is in web.data(). This could be a binary deposit or
# an atom entry deposit - rely on the passed/determined argument to determine which
if atom_only:
ssslog.info("Received Entry deposit request")
d.atom = file_handle.read() # read from the tmp file
else:
ssslog.info("Received Binary deposit request")
d.content = file_handle.read() # read from the tmp file
"""
wsgi_input = web.ctx.env['wsgi.input']
if wsgi_input is not None:
wsgi_input.rfile.seek(0, 0)
d.content = wsgi_input.rfile.read()
"""
if is_multipart:
d.filename = h.extract_filename(media_part_headers)
else:
d.filename = h.extract_filename(mapped_headers)
# now just attach the authentication data and return
d.auth = auth
return d
'''


def get_delete(self, web, auth=None): def get_delete(self, web, auth=None):
""" """
Expand Down Expand Up @@ -284,7 +494,8 @@ def GET(self, sub_path=None):
# if we get here authentication was successful and we carry on (we don't care who authenticated) # if we get here authentication was successful and we carry on (we don't care who authenticated)
ss = SwordServer(config, auth) ss = SwordServer(config, auth)
sd = ss.service_document(sub_path) sd = ss.service_document(sub_path)
web.header("Content-Type", "text/xml") web.header("Content-Type", "application/atomsvc+xml")
# web.header("Content-Type", "text/xml")
return sd return sd


class Collection(SwordHttpHandler): class Collection(SwordHttpHandler):
Expand All @@ -311,7 +522,7 @@ def GET(self, collection):
cl = ss.list_collection(collection) cl = ss.list_collection(collection)
web.header("Content-Type", "text/xml") web.header("Content-Type", "text/xml")
return cl return cl

def POST(self, collection): def POST(self, collection):
""" """
POST either an Atom Multipart request, or a simple package into the specified collection POST either an Atom Multipart request, or a simple package into the specified collection
Expand All @@ -325,6 +536,14 @@ def POST(self, collection):
# authenticate # authenticate
auth = self.http_basic_authenticate(web) auth = self.http_basic_authenticate(web)


# FIXME: this was supposed to help us with our scalability, but
# unfortunately the way that web.py works, it is not possible to
# read the incoming file to disk and still use the other functions
# on the web object (e.g. input())

# store any body content in a temp file
#fh = self.read_to_tmp(web)

# check the validity of the request # check the validity of the request
self.validate_deposit_request(web, "6.3.3", "6.3.1", "6.3.2") self.validate_deposit_request(web, "6.3.3", "6.3.1", "6.3.2")


Expand Down

0 comments on commit be4a5f3

Please sign in to comment.