Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 391 lines (355 sloc) 14.2 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from os import listdir, path, remove, rename
from sys import argv, exit, stderr, stdout
import errno
import gobject, pygst
pygst.require("0.10")
import gst
import logging
import progressbar
import mutagen.oggtheora
import pprint
import subprocess
from helpers import autovividict, filename_from_url, media, make_datestring
from model import session, setup_all, create_all, set_source, \
Article, Category, Journal, SupplementaryMaterial
# Command-line handling: the script expects an action name followed by a
# source name.  Exit codes: 1 = missing arguments, 2 = unknown action,
# 3 = unknown source.
try:
    action = argv[1]
    target = argv[2]
except IndexError:
    stderr.write("""
oa-cache – Open Access Media Importer local operations
usage: oa-cache browse-database [source] |
oa-cache clear-media [source] |
oa-cache clear-database [source] |
oa-cache convert-media [source] |
oa-cache find-media [source] |
oa-cache forget-converted [source] |
oa-cache forget-downloaded [source] |
oa-cache forget-uploaded [source] |
oa-cache print-database-path [source]
oa-cache stats [source]
""")
    exit(1)

# Validate with an explicit membership test.  An "assert" would be stripped
# when Python runs with -O, silently accepting unknown actions.
if action not in (
    'browse-database', 'clear-media', 'clear-database',
    'convert-media', 'find-media', 'forget-converted',
    'forget-downloaded', 'forget-uploaded', 'print-database-path',
    'stats',
):
    stderr.write('Unknown action “%s”.\n' % action)
    exit(2)

# Load sources.<target> by name.  importlib is used instead of exec'ing a
# formatted "from sources import ..." statement so that text taken from the
# command line is never executed as code.
# NOTE(review): assumes every source is a submodule of the "sources"
# package (not merely an attribute of it) — confirm.
import importlib
try:
    source_module = importlib.import_module('sources.%s' % target)
except (ImportError, ValueError):  # invalid source
    stderr.write("Unknown source “%s”.\n" % target)
    exit(3)

set_source(target)
setup_all(True)
# Deliberately imported only after set_source()/setup_all() have run; the
# statement order is kept as it was.
from helpers import config
# browse-database: open the source's database file in the external
# "sqlitebrowser" program; exit with status 4 if it cannot be launched.
if action == 'browse-database':
    browser_command = ['sqlitebrowser', config.database_path(target)]
    try:
        subprocess.call(browser_command)
    except OSError:
        stderr.write('Unable to start sqlitebrowser <http://sqlitebrowser.sourceforge.net/>.\n')
        exit(4)
# clear-media: delete the download cache file and every entry in the
# source's refined media directory.
if action == 'clear-media':
    # NOTE(review): this is the *refined* media path; confirm whether raw
    # media is meant to be cleared as well (or instead).
    media_refined_directory = config.get_media_refined_source_path(target)
    # Listing is taken before anything is deleted.
    listing = listdir(media_refined_directory)

    metadata_refined_directory = config.get_metadata_refined_source_path(target)
    download_cache_path = path.join(metadata_refined_directory, 'download_cache')
    try:
        remove(download_cache_path)
    except OSError as e:
        # A cache file that does not exist is already cleared; that must not
        # abort the media removal below.  Any other failure is re-raised.
        if e.errno != errno.ENOENT:
            raise

    for filename in listing:
        media_path = path.join(media_refined_directory, filename)
        stderr.write("Removing “%s” … " % media_path)
        remove(media_path)
        stderr.write("done.\n")
# clear-database: delete the source's database file, reporting (but not
# propagating) any OSError raised by the removal.
if action == 'clear-database':
    database_file = config.database_path(target)
    stderr.write("Removing “%s” … " % database_file)
    try:
        remove(database_file)
    except OSError as error:
        stderr.write('\n%s\n' % str(error))
    else:
        stderr.write("done.\n")
# convert-media: convert every supplementary material that is downloaded
# but not yet converted, then tag the converted file with article metadata.
if action == 'convert-media':
    materials = SupplementaryMaterial.query.filter_by(
        downloaded=True,
        converted=False
    ).all()

    # These paths do not depend on the individual material, so they are
    # looked up once instead of once per iteration.
    # NOTE(review): assumes the config getters have no per-call side
    # effects — confirm.
    media_refined_directory = config.get_media_refined_source_path(target)
    media_raw_directory = config.get_media_raw_source_path(target)
    # Conversion output is written to this fixed name first and renamed to
    # its final name afterwards.
    temporary_media_path = path.join(media_refined_directory, 'current.ogg')

    for material in materials:
        filename = filename_from_url(material.url)
        media_raw_path = path.join(media_raw_directory, filename)
        media_refined_path = path.join(media_refined_directory, filename + '.ogg')

        # "converting" is committed as True before a conversion starts and
        # only reset after it completes, so a material that still carries
        # the flag belongs to an attempt that did not finish.
        if material.converting:
            stderr.write("Skipping conversion of “%s”, earlier attempt failed.\n" % \
                media_raw_path.encode('utf-8'))
            continue

        if path.isfile(media_refined_path):
            stderr.write("Skipping conversion of “%s”, exists at “%s”.\n" %
                (
                    media_raw_path.encode('utf-8'),
                    media_refined_path.encode('utf-8')
                )
            )
            material.converted = True
            session.commit()
            continue

        material.converting = True
        session.commit()

        stderr.write("Converting “%s”, saving into “%s” … " % (
                media_raw_path.encode('utf-8'),
                media_refined_path.encode('utf-8')
            )
        )

        m = media.Media(media_raw_path)
        try:
            m.find_streams()
            m.convert(temporary_media_path)
        except RuntimeError as e:
            # The material keeps converting=True, so later runs skip it.
            logging.error("%s: Skipping conversion of “%s”.", \
                e, media_raw_path.encode('utf-8'))
            continue

        try:
            f = mutagen.oggtheora.OggTheora(temporary_media_path)
            for key, value in \
                [
                    ('TITLE', material.title),
                    ('ALBUM', material.article.title),
                    ('ARTIST', material.article.contrib_authors),
                    ('COPYRIGHTS', material.article.copyright_holder),
                    ('LICENSE', material.article.license_url),
                    ('DESCRIPTION', material.caption),
                    ('DATE', make_datestring(
                        material.article.year,
                        material.article.month,
                        material.article.day
                    ))
                ]:
                if value is not None:
                    f[key] = value
                else:
                    logging.warning('Missing metadata: %s.', key)
            f.save()
        except mutagen.oggtheora.OggTheoraHeaderError as e:
            # Most probably an encoding failure.  Tagging stays best-effort
            # (the file is still moved into place below), but the failure is
            # recorded instead of being dropped silently.
            logging.warning("%s: Could not write metadata for “%s”.", \
                e, media_raw_path.encode('utf-8'))

        rename(temporary_media_path, media_refined_path)
        stderr.write("done.\n")
        material.converting = False
        material.converted = True
        session.commit()
# forget-converted / forget-downloaded / forget-uploaded: clear one boolean
# status flag on every supplementary material that currently has it set.
# Each entry maps an action to the SupplementaryMaterial attribute it resets
# and the progress message printed before doing so.
forgettable_flags = {
    'forget-converted': ('converted', "Forgetting conversion of %s materials … "),
    'forget-downloaded': ('downloaded', "Forgetting download of %s materials … "),
    'forget-uploaded': ('uploaded', "Forgetting upload of %s materials … "),
}
if action in forgettable_flags:
    flag_name, progress_message = forgettable_flags[action]
    materials = SupplementaryMaterial.query.filter_by(
        **{flag_name: True}
    ).all()
    stderr.write(progress_message % len(materials))
    for material in materials:
        setattr(material, flag_name, False)
    session.commit()
    stderr.write("done.\n")
# find-media: iterate over the articles the source module lists from its raw
# metadata and record journals, articles, categories and supplementary
# materials in the database.
if action == 'find-media':
    # Names of articles already stored are passed to the source module as
    # its skip list.
    skip = [article.name for article in Article.query.all()]
    if skip:
        stderr.write('Skipping %s records … \n' % len(skip))
    source_path = config.get_metadata_raw_source_path(target)
    for result in source_module.list_articles(
        source_path,
        supplementary_materials=True,
        skip=skip
    ):
        try:
            journal = Journal.get_by(title=result['journal-title'])
            if not journal:
                journal = Journal(
                    title=result['journal-title']
                )
            article = Article.get_by(
                title=result['article-title'],
                contrib_authors=result['article-contrib-authors']
            )
            if not article:
                # if there is a whitelist, skip non-whitelisted content
                doi = result['doi']
                try:
                    doi_prefix = doi.split('/')[0]
                except AttributeError:
                    # doi has no split(), i.e. it is not a string.
                    stderr.write(
                        "Skipping Article “%s”, as it has no DOI.\n" % \
                            result['name']
                    )
                    continue
                if config.whitelist_doi and \
                    doi_prefix not in config.whitelist_doi:
                    stderr.write(
                        "Skipping DOI %s, prefix %s is not in whitelist.\n" % \
                            (doi, doi_prefix)
                    )
                    continue
                article = Article(
                    name=result['name'],
                    doi=doi,
                    title=result['article-title'],
                    contrib_authors=result['article-contrib-authors'],
                    abstract=result['article-abstract'],
                    year=result['article-year'],
                    month=result['article-month'],
                    day=result['article-day'],
                    url=result['article-url'],
                    license_url=result['article-license-url'],
                    license_text=result['article-license-text'],
                    copyright_statement=result['article-copyright-statement'],
                    copyright_holder=result['article-copyright-holder'],
                    journal=journal
                )
                for category_name in result['article-categories']:
                    category = Category.get_by(name=category_name)
                    if not category:
                        category = Category(name=category_name)
                    category.articles.append(article)
            materials = result['supplementary-materials']
            if materials:
                # The title is wrapped in a matching pair of quotes; the
                # opening “ was previously missing from this line.
                stderr.write(
                    '“%s”:\n' % result['article-title'].encode('utf-8')
                )
                # Tally of "type/subtype" strings for the summary below.
                mimetypes = {}
                for material in materials:
                    mimetype = material['mimetype'] + '/' + material['mime-subtype']
                    mimetypes[mimetype] = mimetypes.get(mimetype, 0) + 1
                    # Materials are identified by URL; only unknown ones are
                    # created.
                    supplementary_material = SupplementaryMaterial.get_by(url=material['url'])
                    if not supplementary_material:
                        supplementary_material = SupplementaryMaterial(
                            label=material['label'],
                            title=material['title'],
                            caption=material['caption'],
                            mimetype=material['mimetype'],
                            mime_subtype=material['mime-subtype'],
                            url=material['url'],
                            article=article
                        )
                for mimetype, count in mimetypes.items():
                    stderr.write(
                        '\t%s × %s\n' % (
                            count,
                            mimetype
                        )
                    )
                stderr.write('\n')
            session.commit()
        except KeyboardInterrupt:
            # Persist what has been collected so far before exiting cleanly.
            stderr.write('Saving database …\n')
            session.commit()
            exit(0)
# print-database-path: write the source's database path to stdout, without a
# trailing newline.
if action == "print-database-path":
    stdout.write(config.database_path(target))
# stats: tally licensing and MIME type information over all supplementary
# materials, grouped by DOI prefix, and pretty-print the tallies to stdout.
# Progress output goes to stderr.
if action == "stats":
    stderr.write('Counting supplementary materials … ')
    materials = SupplementaryMaterial.query.all()
    material_count = len(materials)
    stderr.write(str(material_count) + ' supplementary materials found.\n')
    progress = progressbar.ProgressBar(maxval=material_count).start()

    # Counters, each keyed first by a fixed classification.
    licenses = {
        'free': autovividict(),
        'non-free': autovividict(),
    }
    licensing_publishers = {
        'url': autovividict(),
        'url-from-text': autovividict(),
        'text': autovividict(),
        'none': autovividict()
    }
    mimetypes = {
        'free': autovividict(),
        'non-free': autovividict(),
        'misreported': autovividict()
    }
    mimetypes_publishers = {
        'correct': autovividict(),
        'incorrect': autovividict(),
        'unknown': autovividict()
    }
    mimetypes_prefix_publishers = {
        'free': autovividict(),
        'non-free': autovividict()
    }

    for completed, material in enumerate(materials, 1):
        license_url = material.article.license_url
        license_text = material.article.license_text
        copyright_statement = material.article.copyright_statement
        doi = material.article.doi
        # Materials without a DOI are grouped under the literal prefix
        # u'None'.
        doi_prefix = u'None' if doi is None else doi.split('/')[0]

        mimetype = material.mimetype
        mimetype_composite = mimetype + '/' + material.mime_subtype

        # Compare the stored MIME type against the reported one.
        try:
            mimetype_composite_reported = material.mimetype_reported + '/' + material.mime_subtype_reported
            if mimetype_composite == 'application/msword':
                mimetypes_publishers['unknown'][doi_prefix] += 1
            elif mimetype_composite != mimetype_composite_reported:
                mimetypes['misreported'][mimetype_composite][mimetype_composite_reported] += 1
                mimetypes_publishers['incorrect'][doi_prefix] += 1
            else:  # mimetype is correct
                mimetypes_publishers['correct'][doi_prefix] += 1
        except TypeError:  # oa-get update-mimetypes was not run
            mimetypes_publishers['unknown'][doi_prefix] += 1

        # A material counts as free when its license URL is one of the
        # configured free license URLs.
        freedom = 'free' if license_url in config.free_license_urls else 'non-free'
        licenses[freedom][license_url] += 1
        mimetypes[freedom][mimetype_composite] += 1
        mimetypes_prefix_publishers[freedom][mimetype][doi_prefix] += 1

        # Classify which licensing fields the article provides.
        has_licensing_text = license_text is not None or \
            copyright_statement is not None
        if license_url is None:
            # 'none' means no licensing information at all.
            licensing_kind = 'text' if has_licensing_text else 'none'
        else:
            # 'url' means a URL was given and no text is present.
            licensing_kind = 'url-from-text' if has_licensing_text else 'url'
        licensing_publishers[licensing_kind][doi_prefix] += 1

        progress.update(completed)

    stderr.write('\n')
    report = {
        'licenses': licenses,
        'licensing_publishers': licensing_publishers,
        'mimetypes': mimetypes,
        'mimetypes_publishers': mimetypes_publishers,
        'mimetypes_prefix_publishers': mimetypes_prefix_publishers
    }
    stdout.write(pprint.pformat(report, indent=4) + '\n')