Skip to content

Commit

Permalink
api fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
da1nerd committed May 5, 2018
1 parent 2c11a69 commit a76b9f7
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 33 deletions.
73 changes: 42 additions & 31 deletions libraries/lambda_handlers/ts_v2_catalog_handler.py
Expand Up @@ -20,14 +20,13 @@
from libraries.tools.file_utils import read_file, download_rc
from libraries.tools.legacy_utils import index_obs
from libraries.tools.url_utils import download_file, get_url, url_exists
from libraries.tools.ts_v2_utils import convert_rc_links, build_json_source_from_usx, make_legacy_date,\
from libraries.tools.ts_v2_utils import convert_rc_links, build_json_source_from_usx, make_legacy_date, \
max_modified_date, get_rc_type, build_usx, prep_data_upload, index_tn_rc

from libraries.lambda_handlers.instance_handler import InstanceHandler


class TsV2CatalogHandler(InstanceHandler):

cdn_root_path = 'v2/ts'
api_version = 'ts.2'

Expand All @@ -39,27 +38,27 @@ def __init__(self, event, context, logger, **kwargs):
self.cdn_url = self.retrieve(env_vars, 'cdn_url', 'Environment Vars').rstrip('/')
self.from_email = self.retrieve(env_vars, 'from_email', 'Environment Vars')
self.to_email = self.retrieve(env_vars, 'to_email', 'Environment Vars')
self.logger = logger # type: logging._loggerClass
self.logger = logger # type: logging._loggerClass
if 's3_handler' in kwargs:
self.cdn_handler = kwargs['s3_handler']
else:
self.cdn_handler = S3Handler(self.cdn_bucket) # pragma: no cover
self.cdn_handler = S3Handler(self.cdn_bucket) # pragma: no cover
if 'dynamodb_handler' in kwargs:
self.db_handler = kwargs['dynamodb_handler']
else:
self.db_handler = DynamoDBHandler('{}d43-catalog-status'.format(self.stage_prefix())) # pragma: no cover
self.db_handler = DynamoDBHandler('{}d43-catalog-status'.format(self.stage_prefix())) # pragma: no cover
if 'url_handler' in kwargs:
self.get_url = kwargs['url_handler']
else:
self.get_url = get_url # pragma: no cover
self.get_url = get_url # pragma: no cover
if 'download_handler' in kwargs:
self.download_file = kwargs['download_handler']
else:
self.download_file = download_file # pragma: no cover
self.download_file = download_file # pragma: no cover
if 'url_exists_handler' in kwargs:
self.url_exists = kwargs['url_exists_handler']
else:
self.url_exists = url_exists # pragma: no cover
self.url_exists = url_exists # pragma: no cover

self.temp_dir = tempfile.mkdtemp('', 'tsv2', None)

Expand Down Expand Up @@ -180,8 +179,9 @@ def __execute(self):
if process_id not in self.status['processed']:
self.logger.debug('Processing {}'.format(process_id))
obs_json = index_obs(lid, rid, format, self.temp_dir, self.download_file)
upload = prep_data_upload('{}/{}/{}/v{}/source.json'.format(pid, lid, rid, res['version']),
obs_json, self.temp_dir)
upload = prep_data_upload(
'{}/{}/{}/v{}/source.json'.format(pid, lid, rid, res['version']),
obs_json, self.temp_dir)
self._upload(upload)
finished_processes[process_id] = []
else:
Expand Down Expand Up @@ -212,7 +212,8 @@ def __execute(self):
if finished_processes:
self.status['processed'].update(finished_processes)
self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
self.db_handler.update_item({'api_version': TsV2CatalogHandler.api_version}, self.status)
self.db_handler.update_item({'api_version': TsV2CatalogHandler.api_version},
self.status)

if not rc_format:
raise Exception('Could not find a format for {}_{}_{}'.format(lid, rid, pid))
Expand All @@ -222,7 +223,9 @@ def __execute(self):

if modified is None:
modified = time.strftime('%Y%m%d')
self.logger.warning('Could not find date modified for {}_{}_{} from "{}"'.format(lid, rid, pid, rc_format['modified']))
self.logger.warning('Could not find date modified for {}_{}_{} from "{}"'.format(lid, rid, pid,
rc_format[
'modified']))

if rc_type == 'book' or rc_type == 'bundle':
self._build_catalog_node(cat_dict, lang, res, project, modified)
Expand Down Expand Up @@ -275,21 +278,21 @@ def __execute(self):
api_uploads.append(prep_data_upload('{}/{}/resources.json'.format(pid, lid), res_cat, self.temp_dir))

del lang['_res']
if('project' in lang):
if ('project' in lang):
# skip empty artifacts
lang_cat.append(lang)
else:
self.logger.warning('Excluding empty language artifact in {}'.format(pid))
api_uploads.append(prep_data_upload('{}/languages.json'.format(pid), lang_cat, self.temp_dir))

del project['_langs']
del project['_langs']
root_cat.append(project)
catalog_upload = prep_data_upload('catalog.json', root_cat, self.temp_dir)
api_uploads.append(catalog_upload)
# TRICKY: also upload to legacy path for backwards compatibility
api_uploads.append({
'key':'/ts/txt/2/catalog.json',
'path':catalog_upload['path']
'key': '/ts/txt/2/catalog.json',
'path': catalog_upload['path']
})

# upload files
Expand Down Expand Up @@ -359,8 +362,9 @@ def _index_note_files(self, lid, rid, format, process_id):
if not rc_dir: return {}

tn_uploads = index_tn_rc(lid=lid,
temp_dir=self.temp_dir,
rc_dir=rc_dir)
temp_dir=self.temp_dir,
rc_dir=rc_dir,
reporter=self)

return tn_uploads

Expand Down Expand Up @@ -432,7 +436,6 @@ def _index_question_files(self, lid, rid, format, process_id):

return tq_uploads


def _index_words_files(self, lid, rid, format, process_id):
"""
Returns an array of markdown files found in a tW dictionary
Expand All @@ -446,7 +449,8 @@ def _index_words_files(self, lid, rid, format, process_id):
obs_example_re = re.compile('\_*\[([^\[\]]+)\]\(([^\(\)]+)\)_*(.*)', re.UNICODE | re.IGNORECASE)
block_re = re.compile('^##', re.MULTILINE | re.UNICODE)
word_links_re = re.compile('\[([^\[\]]+)\]\(\.\.\/(kt|other)\/([^\(\)]+)\.md\)', re.UNICODE | re.IGNORECASE)
ta_html_re = re.compile('(<a\s+href="(:[a-z-_0-9]+:ta:vol\d:[a-z-\_]+:[a-z-\_]+)"\s*>([^<]+)<\/a>)', re.UNICODE | re.IGNORECASE)
ta_html_re = re.compile('(<a\s+href="(:[a-z-_0-9]+:ta:vol\d:[a-z-\_]+:[a-z-\_]+)"\s*>([^<]+)<\/a>)',
re.UNICODE | re.IGNORECASE)

words = []
format_str = format['format']
Expand Down Expand Up @@ -515,7 +519,7 @@ def _index_words_files(self, lid, rid, format, process_id):
word_content = '##'.join(cleaned_blocks)

# find all tW links and use them in related words
related_words = [w[2] for w in word_links_re.findall(word_content) ]
related_words = [w[2] for w in word_links_re.findall(word_content)]

# convert links to legacy form. TODO: we should convert links after converting to html so we don't have to do it twice.
word_content = convert_rc_links(word_content)
Expand All @@ -528,7 +532,8 @@ def _index_words_files(self, lid, rid, format, process_id):
word_content = word_content.replace(ta_link[0], new_link)

words.append({
'aliases': [a.strip() for a in title.split(',') if a.strip() != word_id and a.strip() != title.strip()],
'aliases': [a.strip() for a in title.split(',') if
a.strip() != word_id and a.strip() != title.strip()],
'cf': related_words,
'def': word_content,
'def_title': def_title.rstrip(':'),
Expand Down Expand Up @@ -586,8 +591,10 @@ def _process_usfm(self, lid, rid, resource, format):
# convert USX to JSON
path = os.path.normpath(os.path.join(usx_dir, '{}.usx'.format(pid.upper())))
source = build_json_source_from_usx(path, format['modified'], self)
upload = prep_data_upload('{}/{}/{}/v{}/source.json'.format(pid, lid, rid, resource['version']), source['source'], self.temp_dir)
self.cdn_handler.upload_file(upload['path'], '{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key']))
upload = prep_data_upload('{}/{}/{}/v{}/source.json'.format(pid, lid, rid, resource['version']),
source['source'], self.temp_dir)
self.cdn_handler.upload_file(upload['path'],
'{}/{}'.format(TsV2CatalogHandler.cdn_root_path, upload['key']))

self.status['processed'][process_id] = []
self.status['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ")
Expand Down Expand Up @@ -708,7 +715,7 @@ def _build_catalog_node(self, catalog, language, resource, project, modified):

# resource
res = catalog[pid]['_langs'][lid]['_res'][rid]
r_modified = max_modified_date(res, modified) # TRICKY: dates bubble up from project
r_modified = max_modified_date(res, modified) # TRICKY: dates bubble up from project
comments = '' # TRICKY: comments are not officially supported in RCs but we use them if available
if 'comment' in resource: comments = resource['comment']

Expand All @@ -717,8 +724,8 @@ def _build_catalog_node(self, catalog, language, resource, project, modified):
if rid != 'obs':
chunks_url = 'https://api.unfoldingword.org/bible/txt/1/{}/chunks.json'.format(pid)
# if not self.url_exists(chunks_url) and 'chunks_url' in project:
# Use the v3 api chunks url if the legacy version cannot be found
# chunks_url = project['chunks_url']
# Use the v3 api chunks url if the legacy version cannot be found
# chunks_url = project['chunks_url']

source_url = '{}/{}/{}/{}/{}/v{}/source.json?date_modified={}'.format(
self.cdn_url,
Expand Down Expand Up @@ -774,10 +781,10 @@ def _build_catalog_node(self, catalog, language, resource, project, modified):

# language
lang = catalog[pid]['_langs'][lid]
l_modified = max_modified_date(lang['language'], r_modified) # TRICKY: dates bubble up from resource
l_modified = max_modified_date(lang['language'], r_modified) # TRICKY: dates bubble up from resource
description = ''
if rid == 'obs': description = resource['description']
project_meta = list(project['categories']) # default to category ids
project_meta = list(project['categories']) # default to category ids
if 'category_labels' in language:
project_meta = []
for cat_id in project['categories']:
Expand All @@ -798,7 +805,9 @@ def _build_catalog_node(self, catalog, language, resource, project, modified):
'meta': project_meta,
'name': project['title']
},
'res_catalog': '{}/{}/{}/{}/resources.json?date_modified={}'.format(self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, lid, l_modified)
'res_catalog': '{}/{}/{}/{}/resources.json?date_modified={}'.format(self.cdn_url,
TsV2CatalogHandler.cdn_root_path, pid,
lid, l_modified)
}
if 'ulb' == rid or 'udb' == rid:
cat_lang['project']['sort'] = '{}'.format(project['sort'])
Expand All @@ -808,7 +817,9 @@ def _build_catalog_node(self, catalog, language, resource, project, modified):
p_modified = max_modified_date(catalog[pid], l_modified)
catalog[pid].update({
'date_modified': p_modified,
'lang_catalog': '{}/{}/{}/languages.json?date_modified={}'.format(self.cdn_url, TsV2CatalogHandler.cdn_root_path, pid, p_modified),
'lang_catalog': '{}/{}/{}/languages.json?date_modified={}'.format(self.cdn_url,
TsV2CatalogHandler.cdn_root_path, pid,
p_modified),
'meta': project['categories'],
'slug': pid,
'sort': '{}'.format(project['sort']).zfill(2)
Expand Down
11 changes: 9 additions & 2 deletions libraries/tools/ts_v2_utils.py
Expand Up @@ -102,7 +102,15 @@ def index_tn_rc(lid, temp_dir, rc_dir, reporter=None):
general_notes = note_general_re.search(verse_body)

# close chunk
if firstvs is not None and (pid == 'obs' or verse in chunk_json[chapter]):
chapter_key = chapter
if firstvs is not None and (pid != 'obs' and chapter_key not in chunk_json):
if reporter:
reporter.report_error('Could not find chunk data for {} {} {}'.format(rc_dir, pid, chapter_key))
# attempt to recover if Psalms
if pid == 'psa':
chapter_key = chapter_key.zfill(3)

if firstvs is not None and (pid == 'obs' or verse in chunk_json[chapter_key]):
note_json.append({
'id': '{}-{}'.format(chapter, firstvs),
'tn': notes
Expand Down Expand Up @@ -147,7 +155,6 @@ def index_tn_rc(lid, temp_dir, rc_dir, reporter=None):

return tn_uploads


def prep_data_upload(key, data, temp_dir):
"""
Prepares some data for upload to s3
Expand Down

0 comments on commit a76b9f7

Please sign in to comment.