from zipfile import ZipFile
from lxml import etree
from spec import Namespaces
from sss_logging import logging
ssslog = logging.getLogger(__name__)

class DisseminationPackager(object):
    """
    Interface for all classes wishing to provide dissemination packaging services to the SSS
    """
    def __init__(self, dao, uri_manager):
        pass

    def package(self, collection, id):
        """
        Package up all the content in the specified container.  This method must be implemented by the extender.
        The method should create a package in the store directory, and then return to the caller the path to that
        file so that it can be served back to the client.
        """
        pass

    def get_uri(self):
        return "http://purl.org/net/sword/package/SimpleZip"

class IngestPackager(object):
    def __init__(self, dao):
        pass

    def ingest(self, collection, id, filename, metadata_relevant):
        """
        The package with the supplied filename has been placed in the identified container.  This should be
        inspected and unpackaged.  Implementations should note that there may optionally be an atom document in
        the container which needs to be inspected, and this can be retrieved from DAO.get_atom_content().  If the
        metadata_relevant argument is False, implementations should not change the already extracted metadata in
        the container.
        """
        return []
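
# A minimal sketch of how the IngestPackager interface might be extended.  The
# SimpleTarIngester class itself is hypothetical and purely illustrative; it
# mirrors the SimpleZipIngester defined below, but for tarballs, and uses only
# the same DAO calls made elsewhere in this module.
import tarfile

class SimpleTarIngester(IngestPackager):
    def __init__(self, dao):
        self.dao = dao

    def ingest(self, collection, id, filename, metadata_relevant=True):
        # open the deposited tarball from the store, record its members as the
        # derived resources, and unpack it alongside the original deposit
        tar = tarfile.open(self.dao.get_store_path(collection, id, filename))
        derived_resources = tar.getnames()
        tar.extractall(self.dao.get_store_path(collection, id))
        tar.close()
        return derived_resources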

class DefaultDisseminator(DisseminationPackager):
    """
    Basic default packager; this just zips up everything except the SSS-specific files in the container and stores
    the result in a file called sword-default-package.zip.
    """
    def __init__(self, dao, uri_manager):
        self.dao = dao

    def package(self, collection, id):
        """ package up the content """
        # get a list of the relevant content files
        files = self.dao.list_content(collection, id, exclude=["sword-default-package.zip"])

        # create a zip file containing all of the original files
        zpath = self.dao.get_store_path(collection, id, "sword-default-package.zip")
        z = ZipFile(zpath, "w")
        for file in files:
            z.write(self.dao.get_store_path(collection, id, file), file)
        z.close()

        # return the path to the package to the caller
        return zpath
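
# Rough usage sketch (the calling code shown here is assumed, for illustration
# only): the web layer constructs a disseminator with its DAO and URIManager,
# asks it to package a container, and streams the returned file to the client.
#
#   packager = DefaultDisseminator(dao, uri_manager)
#   zpath = packager.package("collection-name", "container-id")   # placeholder identifiers
#   # zpath now points at sword-default-package.zip inside the store, and
#   # packager.get_uri() gives the packaging format URI to advertise for it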

class FeedDisseminator(DisseminationPackager):
    def __init__(self, dao, uri_manager):
        self.dao = dao
        self.ns = Namespaces()
        self.um = uri_manager
        self.nsmap = {None: self.ns.ATOM_NS}

    def package(self, collection, id):
        """ create a feed representation of the package """
        # get a list of the relevant content files
        files = self.dao.list_content(collection, id, exclude=["mediaresource.feed.xml"])

        # create a feed object with all the files as entries
        feed = etree.Element(self.ns.ATOM + "feed", nsmap=self.nsmap)
        for file in files:
            entry = etree.SubElement(feed, self.ns.ATOM + "entry")

            em = etree.SubElement(entry, self.ns.ATOM + "link")
            em.set("rel", "edit-media")
            em.set("href", self.um.part_uri(collection, id, file))

            edit = etree.SubElement(entry, self.ns.ATOM + "link")
            edit.set("rel", "edit")
            edit.set("href", self.um.part_uri(collection, id, file) + ".atom")

            content = etree.SubElement(entry, self.ns.ATOM + "content")
            content.set("type", "application/octet-stream")  # FIXME: we're not storing content types, so we don't know
            content.set("src", self.um.part_uri(collection, id, file))

        fpath = self.dao.get_store_path(collection, id, "mediaresource.feed.xml")
        f = open(fpath, "wb")
        f.write(etree.tostring(feed, pretty_print=True))
        f.close()
        return fpath

    def get_uri(self):
        return None
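
# For reference, the mediaresource.feed.xml written above has roughly this shape
# (the href/src values come from UriManager.part_uri(); the ones shown here are
# placeholders, not real URIs):
#
#   <feed xmlns="http://www.w3.org/2005/Atom">
#     <entry>
#       <link rel="edit-media" href=".../part-uri/somefile.pdf"/>
#       <link rel="edit" href=".../part-uri/somefile.pdf.atom"/>
#       <content type="application/octet-stream" src=".../part-uri/somefile.pdf"/>
#     </entry>
#     ...
#   </feed>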

class BinaryIngester(IngestPackager):
    def __init__(self, dao):
        pass

    def ingest(self, collection, id, filename, metadata_relevant):
        # does nothing; we don't try to unpack binary deposits
        return []

class SimpleZipIngester(IngestPackager):
    def __init__(self, dao):
        self.dao = dao
        self.ns = Namespaces()

    def ingest(self, collection, id, filename, metadata_relevant=True):
        # first, just extract all the contents of the zip
        z = ZipFile(self.dao.get_store_path(collection, id, filename))

        # keep track of the names of the files in the zip, as these will become
        # our derived resources
        derived_resources = z.namelist()

        # FIXME: what we do here is intrinsically insecure, but SSS is not a
        # production service, so we're not worrying about it!
        path = self.dao.get_store_path(collection, id)
        z.extractall(path)

        # check for the atom document
        atom = self.dao.get_atom_content(collection, id)
        if atom is None:
            # there's no metadata to extract so just leave it
            return derived_resources

        # if the metadata is not relevant, then we don't need to continue
        if not metadata_relevant:
            return derived_resources

        metadata = {}
        entry = etree.fromstring(atom)

        # go through each element in the atom entry and just process the ones we care about
        # explicitly retrieve the atom-based metadata first
        for element in entry.getchildren():
            if element.tag == self.ns.ATOM + "title":
                self.a_insert(metadata, "title", element.text.strip())
            if element.tag == self.ns.ATOM + "updated":
                self.a_insert(metadata, "date", element.text.strip())
            if element.tag == self.ns.ATOM + "author":
                authors = ""
                for names in element.getchildren():
                    authors += names.text.strip() + " "
                self.a_insert(metadata, "creator", authors.strip())
            if element.tag == self.ns.ATOM + "summary":
                self.a_insert(metadata, "abstract", element.text.strip())

        # now go through and retrieve the dcterms from the entry
        for element in entry.getchildren():
            if not isinstance(element.tag, basestring):
                continue

            # we operate an additive policy with metadata.  Duplicate
            # keys are allowed, but duplicate key/value pairs are not.
            if element.tag.startswith(self.ns.DC):
                key = element.tag[len(self.ns.DC):]
                val = element.text.strip()
                self.a_insert(metadata, key, val)

        self.dao.store_metadata(collection, id, metadata)
        return derived_resources

    def a_insert(self, d, key, value):
        if d.has_key(key):
            vs = d[key]
            if value not in vs:
                d[key].append(value)
        else:
            d[key] = [value]

class METSDSpaceIngester(IngestPackager):
    def ingest(self, collection, id, filename, metadata_relevant):
        # we don't need to implement this; it is here just as an example.  A real implementation would unzip the
        # file and import the metadata it contains
        return []

class DefaultEntryIngester(object):
    def __init__(self, dao):
        self.dao = dao
        self.ns = Namespaces()

    def ingest(self, collection, id, atom, additive=False):
        ssslog.debug("Ingesting Metadata; Additive? " + str(additive))

        # store the atom
        self.dao.store_atom(collection, id, atom)

        # now extract/augment the metadata
        metadata = {}
        if additive:
            # start with any existing metadata
            metadata = self.dao.get_metadata(collection, id)

        ssslog.debug("Existing Metadata (before new ingest): " + str(metadata))
        ssslog.debug("Incoming atom: " + atom)

        entry = etree.fromstring(atom)

        # go through each element in the atom entry and just process the ones we care about
        # explicitly retrieve the atom-based metadata first
        for element in entry.getchildren():
            if element.tag == self.ns.ATOM + "title":
                self.a_insert(metadata, "title", element.text.strip())
            if element.tag == self.ns.ATOM + "updated":
                self.a_insert(metadata, "date", element.text.strip())
            if element.tag == self.ns.ATOM + "author":
                authors = ""
                for names in element.getchildren():
                    authors += names.text.strip() + " "
                self.a_insert(metadata, "creator", authors.strip())
            if element.tag == self.ns.ATOM + "summary":
                self.a_insert(metadata, "abstract", element.text.strip())

        # now go through and retrieve the dcterms from the entry
        for element in entry.getchildren():
            if not isinstance(element.tag, basestring):
                continue

            # we operate an additive policy with metadata.  Duplicate
            # keys are allowed, but duplicate key/value pairs are not.
            if element.tag.startswith(self.ns.DC):
                key = element.tag[len(self.ns.DC):]
                val = element.text.strip()
                self.a_insert(metadata, key, val)

        ssslog.debug("Current Metadata (extracted + previously existing): " + str(metadata))
        self.dao.store_metadata(collection, id, metadata)

    def a_insert(self, d, key, value):
        if d.has_key(key):
            vs = d[key]
            if value not in vs:
                d[key].append(value)
        else:
            d[key] = [value]
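
# Rough usage sketch (the calling code shown here is assumed, for illustration
# only): on an initial deposit the entry ingester replaces the container's
# metadata, while on a later append the caller passes additive=True so existing
# values are kept and new key/value pairs are merged in.
#
#   ingester = DefaultEntryIngester(dao)
#   ingester.ingest("collection-name", "container-id", atom_entry_xml)                   # replace
#   ingester.ingest("collection-name", "container-id", later_entry_xml, additive=True)   # augment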