Feature 8/add tq to md2html conversion (#185)

* Adding tQ support. * TqPreprocessor & TestTqPreprocessor - Adding tQ support. * TqLinter & TestMd2HtmlConverter - Adding tQ support. * merge changes from tw * TqLinter - fix link urls * TestMd2HtmlConverter - finished test_tq() * ProjectDeployerTests - added test_tq_deploy_revision_to_door43() * TqPreprocessor - fix links * TqPreprocessor - adding index.json * TqPreprocessor - fix for generating index.json * TqPreprocessor - added chapter numbers for books * TqPreprocessor - fixed header level of book title. Added book title before chapter headers. * TqPreprocessor - Added chunk headers. * TestConversions - Added tq test. TqLinter - removed unnecessary internal link validation. * TqPreprocessor & TqLinter- added warning on missing files * TqPreprocessor - added toc * TqPreprocessor - added toc * TqPreprocessor - added toc * TqLinter & TestTqLinter - improving missing book testing * TqPreprocessor & TestTqPreprocessor - removed passing repo name
unfoldingWord-dev · Oct 17, 2017 · 25caedc · 25caedc
1 parent 982bcdd
commit 25caedc
Show file tree

Hide file tree

Showing 18 changed files with 599 additions and 28 deletions.
diff --git a/functions/convert_md2html/module.json b/functions/convert_md2html/module.json
@@ -2,7 +2,7 @@
     "name": "md2html",
     "version": "2",
     "type": "converter",
-    "resource_types": ["obs", "ta"],
+    "resource_types": ["obs", "ta", "tq"],
     "input_format": ["md"],
     "output_format": ["html"],
     "options": [],

diff --git a/libraries/client/client_webhook.py b/libraries/client/client_webhook.py
@@ -266,15 +266,15 @@ def upload_build_log_to_s3(self, build_log, s3_commit_key, part=''):
     def create_build_log(self, commit_id, commit_message, commit_url, compare_url, job, pusher_username, repo_name,
                          repo_owner):
         """
-        :param string commit_id: 
-        :param string commit_message: 
-        :param string commit_url: 
-        :param string compare_url: 
-        :param TxJob job: 
-        :param string pusher_username: 
-        :param string repo_name: 
-        :param string repo_owner: 
-        :return dict: 
+        :param string commit_id:
+        :param string commit_message:
+        :param string commit_url:
+        :param string compare_url:
+        :param TxJob job:
+        :param string pusher_username:
+        :param string repo_name:
+        :param string repo_owner:
+        :return dict:
         """
         build_log_json = dict(job)
         build_log_json['repo_name'] = repo_name
@@ -345,7 +345,7 @@ def send_request_to_converter(self, job, converter):
         """
         :param TxJob job:
         :param TxModule converter:
-        :return bool: 
+        :return bool:
         """
         payload = {
             'identifier': job.identifier,
@@ -470,18 +470,18 @@ def get_converter_module(self, job):
         :param TxJob job:
         :return TxModule:
         """
-        return TxModule.query().filter(TxModule.type=='converter')\
-            .filter(TxModule.input_format.contains(job.input_format))\
-            .filter(TxModule.output_format.contains(job.output_format))\
-            .filter(TxModule.resource_types.contains(job.resource_type))\
+        return TxModule.query().filter(TxModule.type=='converter') \
+            .filter(TxModule.input_format.contains(job.input_format)) \
+            .filter(TxModule.output_format.contains(job.output_format)) \
+            .filter(TxModule.resource_types.contains(job.resource_type)) \
             .first()
 
     def get_linter_module(self, job):
         """
         :param TxJob job:
         :return TxModule:
         """
-        linters = TxModule.query().filter(TxModule.type=='linter')\
+        linters = TxModule.query().filter(TxModule.type=='linter') \
             .filter(TxModule.input_format.contains(job.input_format))
         linter = linters.filter(TxModule.resource_types.contains(job.resource_type)).first()
         if not linter:

diff --git a/libraries/client/preprocessors.py b/libraries/client/preprocessors.py
@@ -3,6 +3,7 @@
 import re
 from glob import glob
 from shutil import copy
+from libraries.app.app import App
 from libraries.door43_tools.bible_books import BOOK_NUMBERS
 from libraries.general_tools.file_utils import write_file, read_file
 from libraries.resource_container.ResourceContainer import RC
@@ -11,12 +12,19 @@
 
 def do_preprocess(rc, repo_dir, output_dir):
     if rc.resource.identifier == 'obs':
+        App.logger.debug("do_preprocess: using ObsPreprocessor")
         preprocessor = ObsPreprocessor(rc, repo_dir, output_dir)
     elif rc.resource.identifier in BIBLE_RESOURCE_TYPES:
+        App.logger.debug("do_preprocess: using BiblePreprocessor")
         preprocessor = BiblePreprocessor(rc, repo_dir, output_dir)
     elif rc.resource.identifier == 'ta':
+        App.logger.debug("do_preprocess: using TaPreprocessor")
         preprocessor = TaPreprocessor(rc, repo_dir, output_dir)
+    elif rc.resource.identifier == 'tq':
+        App.logger.debug("do_preprocess: using TqPreprocessor")
+        preprocessor = TqPreprocessor(rc, repo_dir, output_dir)
     else:
+        App.logger.debug("do_preprocess: using Preprocessor")
         preprocessor = Preprocessor(rc, repo_dir, output_dir)
     return preprocessor.run(), preprocessor
 
@@ -63,10 +71,13 @@ def run(self):
                 else:
                     # Case #3: The project path is multiple chapters, so we piece them together
                     chapters = self.rc.chapters(project.identifier)
+                    App.logger.debug("Merging chapters in '{0}'".format(project.identifier))
                     if len(chapters):
                         text = ''
                         for chapter in chapters:
+                            text = self.mark_chapter(project.identifier, chapter, text)
                             for chunk in self.rc.chunks(project.identifier, chapter):
+                                text = self.mark_chunk(project.identifier, chapter, chunk, text)
                                 text += read_file(os.path.join(project_path, chapter, chunk))+"\n\n"
                         if project.identifier.lower() in BOOK_NUMBERS:
                             filename = '{0}-{1}.{2}'.format(BOOK_NUMBERS[project.identifier.lower()],
@@ -77,6 +88,12 @@ def run(self):
                         write_file(os.path.join(self.output_dir, filename), text)
         return True
 
+    def mark_chapter(self, ident, chapter, text):
+        return text  # default does nothing to text
+
+    def mark_chunk(self, ident, chapter, chunk, text):
+        return text  # default does nothing to text
+
     def is_multiple_jobs(self):
         return False
 
@@ -415,3 +432,229 @@ def fix_links(self, content):
         content = re.sub(r'([^A-Z0-9"(/])(www\.[A-Z0-9/?&_.:=#-]+[A-Z0-9/?&_:=#-])', r'\1[\2](http://\2)',
                          content, flags=re.IGNORECASE)
         return content
+
+
+class TqPreprocessor(Preprocessor):
+    sections = [
+        {'book': "00-toc", 'title': 'Table of Contents'},
+        {'book': "01-GEN", 'title': 'Genesis'},
+        {'book': "02-EXO", 'title': 'Exodus'},
+        {'book': "03-LEV", 'title': 'Leviticus'},
+        {'book': "04-NUM", 'title': 'Numbers'},
+        {'book': "05-DEU", 'title': 'Deuteronomy'},
+        {'book': "06-JOS", 'title': 'Joshua'},
+        {'book': "07-JDG", 'title': 'Judges'},
+        {'book': "08-RUT", 'title': 'Ruth'},
+        {'book': "09-1SA", 'title': '1 Samuel'},
+        {'book': "10-2SA", 'title': '2 Samuel'},
+        {'book': "11-1KI", 'title': '1 Kings'},
+        {'book': "12-2KI", 'title': '2 Kings'},
+        {'book': "13-1CH", 'title': '1 Chronicles'},
+        {'book': "14-2CH", 'title': '2 Chronicles'},
+        {'book': "15-EZR", 'title': 'Ezra'},
+        {'book': "16-NEH", 'title': 'Nehemiah'},
+        {'book': "17-EST", 'title': 'Esther'},
+        {'book': "18-JOB", 'title': 'Job'},
+        {'book': "19-PSA", 'title': 'Psalms'},
+        {'book': "20-PRO", 'title': 'Proverbs'},
+        {'book': "21-ECC", 'title': 'Ecclesiastes'},
+        {'book': "22-SNG", 'title': 'Song of Solomon'},
+        {'book': "23-ISA", 'title': 'Isaiah'},
+        {'book': "24-JER", 'title': 'Jeremiah'},
+        {'book': "25-LAM", 'title': 'Lamentations'},
+        {'book': "26-EZK", 'title': 'Ezekiel'},
+        {'book': "27-DAN", 'title': 'Daniel'},
+        {'book': "28-HOS", 'title': 'Hosea'},
+        {'book': "29-JOL", 'title': 'Joel'},
+        {'book': "30-AMO", 'title': 'Amos'},
+        {'book': "31-OBA", 'title': 'Obadiah'},
+        {'book': "32-JON", 'title': 'Jonah'},
+        {'book': "33-MIC", 'title': 'Micah'},
+        {'book': "34-NAM", 'title': 'Nahum'},
+        {'book': "35-HAB", 'title': 'Habakkuk'},
+        {'book': "36-ZEP", 'title': 'Zephaniah'},
+        {'book': "37-HAG", 'title': 'Haggai'},
+        {'book': "38-ZEC", 'title': 'Zechariah'},
+        {'book': "39-MAL", 'title': 'Malachi'},
+        {'book': "41-MAT", 'title': 'Matthew'},
+        {'book': "42-MRK", 'title': 'Mark'},
+        {'book': "43-LUK", 'title': 'Luke'},
+        {'book': "44-JHN", 'title': 'John'},
+        {'book': "45-ACT", 'title': 'Acts'},
+        {'book': "46-ROM", 'title': 'Romans'},
+        {'book': "47-1CO", 'title': '1 Corinthians'},
+        {'book': "48-2CO", 'title': '2 Corinthians'},
+        {'book': "49-GAL", 'title': 'Galatians'},
+        {'book': "50-EPH", 'title': 'Ephesians'},
+        {'book': "51-PHP", 'title': 'Philippians'},
+        {'book': "52-COL", 'title': 'Colossians'},
+        {'book': "53-1TH", 'title': '1 Thessalonians'},
+        {'book': "54-2TH", 'title': '2 Thessalonians'},
+        {'book': "55-1TI", 'title': '1 Timothy'},
+        {'book': "56-2TI", 'title': '2 Timothy'},
+        {'book': "57-TIT", 'title': 'Titus'},
+        {'book': "58-PHM", 'title': 'Philemon'},
+        {'book': "59-HEB", 'title': 'Hebrews'},
+        {'book': "60-JAS", 'title': 'James'},
+        {'book': "61-1PE", 'title': '1 Peter'},
+        {'book': "62-2PE", 'title': '2 Peter'},
+        {'book': "63-1JN", 'title': '1 John'},
+        {'book': "64-2JN", 'title': '2 John'},
+        {'book': "65-3JN", 'title': '3 John'},
+        {'book': "66-JUD", 'title': 'Jude'},
+        {'book': "67-REV", 'title': 'Revelation'},
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super(TqPreprocessor, self).__init__(*args, **kwargs)
+        self.section_container_id = 1
+        self.toc = ''
+        self.index_json = None
+        self.section_header_marker = '###############'
+
+    def mark_chapter(self, ident, chapter, text):
+        a = '{0} {1}\n\n'.format(self.section_header_marker, chapter)  # put in invalid header for section - we will correct heading level later
+        return text + a
+
+    def mark_chunk(self, ident, chapter, chunk, text):
+        chunk_marker = os.path.splitext(chunk)[0]
+        a = '{0}# {1}:{2}\n\n'.format(self.section_header_marker, chapter, chunk_marker)  # put in invalid header for section - we will correct heading level later
+        return text + a
+
+    def compile_section(self, title, link, content):
+        """
+        Section markdown creator (builds the markdown for a single book; not recursive)
+
+        :param content:
+        :param link:
+        :param title:
+        :return:
+        """
+        level = 3
+        markdown = ''
+        level_increase = ('#' * level)
+        markdown += '{0} <a id="{1}"/>{2}\n\n'.format('#' * (level-2), link, title)  # add book title
+        content = content.replace('\r', '')
+        lines = content.split('\n')
+        section_header_length = len(self.section_header_marker)
+        for i in range(0, len(lines)):
+            line = lines[i]
+            if line[:section_header_length] == self.section_header_marker:
+                text = line[section_header_length:]
+                if text[0] == '#':  # check if chunk marker
+                    line = level_increase + ' ' + title + text[1:]  # fix header level and add title
+                else:  # chapter marker
+                    line = '#' * (level-1) + ' ' + title + text  # fix header level and add title
+                lines[i] = line
+            elif line and (line[0] == '#'):
+                if line.rstrip()[-1] == '#':
+                    line = level_increase + line.rstrip() + level_increase
+                else:
+                    line = level_increase + line
+                lines[i] = line
+        content = '\n'.join(lines)
+        markdown += content + '\n\n---\n\n'  # horizontal rule
+        return markdown
+
+    def run(self):
+        super(TqPreprocessor, self).run()
+        self.toc = None
+        projects = {}
+        self.index_json = {
+            'titles': {},
+            'chapters': {},
+            'book_codes': {}
+        }
+        for idx, project in enumerate(self.rc.projects):
+            section = self.get_section_for_file(project.identifier)
+            if section:
+                link = self.get_link_for_section(section)
+                book = section['book']
+                if not self.toc:
+                    self.toc = '# Table of Contents:\n\n'
+                projects[book] = {
+                    'link': link,
+                }
+            else:
+                App.logger.debug('TqPreprocessor: extra project found: {0}'.format(project.identifier))
+
+        for section in TqPreprocessor.sections:  # index by book order
+            book = section['book']
+            if book in projects:
+                file = os.path.join(self.output_dir, book + '.md')
+                link = self.get_link_for_section(section)
+                book = section['book']
+                title = section['title']
+                if not os.path.exists(file):
+                    App.logger.debug('TqPreprocessor: book missing: {0}'.format(book))
+                    continue
+                initial_markdown = read_file(file)
+                markdown = self.compile_section(title, link, initial_markdown)
+                markdown = self.fix_links(markdown, book)
+                if initial_markdown != markdown:
+                    write_file(file, markdown)
+                self.toc += '* [{1}](./{0}.html)\n'.format(book, title)
+                self.index_json['titles'][book + '.html'] = title
+            else:
+                App.logger.debug('TqPreprocessor: missing book: {0}'.format(book))
+
+        self.toc = self.fix_links(self.toc, '-')
+        output_file = os.path.join(self.output_dir, '00-toc.md')
+        write_file(output_file, self.toc)
+        self.index_json['titles']['00-toc.html'] = 'Table of Contents'
+        output_file = os.path.join(self.output_dir, 'index.json')
+        write_file(output_file, self.index_json)
+
+        # Copy the toc.yaml and config.yaml files to the output dir so they can be used to
+        # generate the ToC on live.door43.org
+        toc_file = os.path.join(self.source_dir, project.path, 'toc.yaml')
+        if os.path.isfile(toc_file):
+            copy(toc_file, os.path.join(self.output_dir, 'toc.yaml'))
+        config_file = os.path.join(self.source_dir, project.path, 'config.yaml')
+        if os.path.isfile(config_file):
+            copy(config_file, os.path.join(self.output_dir, 'config.yaml'))
+        return True
+
+    def fix_links(self, content, section_link):
+        if not content:
+            return content
+
+        # convert RC links, e.g. rc://en/tn/help/1sa/16/02 => https://git.door43.org/<repo_name>/en_tn/src/master/1sa/16/02.md
+        content = re.sub(r'rc://([^/]+)/([^/]+)/([^/]+)/([^\s)\]\n$]+)',
+                         r'https://git.door43.org/{0}/\1_\2/src/master/\4.md'.format(self.rc.repo_name), content,
+                         flags=re.IGNORECASE)
+        # fix links to other sections within the same manual (only one ../ and a section name that matches section_link)
+        # e.g. [covenant](../kt/covenant.md) => [covenant](#covenant)
+        pattern = r'\]\(\.\.\/{0}\/([^/]+).md\)'.format(section_link)
+        content = re.sub(pattern, r'](#\1)', content)
+        # fix links to other sections within the same manual (only one ../ and a section name)
+        # e.g. [commit](../other/commit.md) => [commit](other.html#commit)
+        for section in TqPreprocessor.sections:
+            link = self.get_link_for_section(section)
+            pattern = re.compile(r'\]\(\.\./{0}/([^/]+).md\)'.format(link))
+            replace = r']({0}.html#\1)'.format(link)
+            content = re.sub(pattern, replace, content)
+        # fix links to other sections that just have the section name but no 01.md page (preserve http:// links)
+        # e.g. See [Verbs](figs-verb) => See [Verbs](#figs-verb)
+        content = re.sub(r'\]\(([^# :/)]+)\)', r'](#\1)', content)
+        # convert URLs to links if not already
+        content = re.sub(r'([^"(])((http|https|ftp)://[A-Z0-9/?&_.:=#-]+[A-Z0-9/?&_:=#-])', r'\1[\2](\2)',
+                         content, flags=re.IGNORECASE)
+        # URLs with just www at the start, no http
+        content = re.sub(r'([^A-Z0-9"(/])(www\.[A-Z0-9/?&_.:=#-]+[A-Z0-9/?&_:=#-])', r'\1[\2](http://\2)',
+                         content, flags=re.IGNORECASE)
+        return content
+
+    def get_section_for_file(self, id):
+        id = id.lower()
+        for section in TqPreprocessor.sections:
+            if (id == section['book'].lower()) or (id == self.get_link_for_section(section)):
+                return section
+        return None
+
+    def get_link_for_section(self, section):
+        link = section['book']
+        parts = link.split('-')
+        if len(parts) > 1:
+            link = parts[1].lower()
+        return link
diff --git a/libraries/converters/converter.py b/libraries/converters/converter.py
@@ -1,4 +1,5 @@
 from __future__ import print_function, unicode_literals
+import json
 import os
 import tempfile
 import traceback
@@ -132,7 +133,7 @@ def do_callback(self, url, payload):
         if url.startswith('http'):
             headers = {"content-type": "application/json"}
             App.logger.debug('Making callback to {0} with payload:'.format(url))
-            App.logger.debug(payload)
+            App.logger.debug(json.dumps(payload)[:256])
             response = requests.post(url, json=payload, headers=headers)
             self.callback_status = response.status_code
             if (self.callback_status >= 200) and (self.callback_status < 299):

diff --git a/libraries/converters/md2html_converter.py b/libraries/converters/md2html_converter.py
@@ -73,7 +73,7 @@ def convert_markdown(self):
                 # Convert files that are markdown files
                 with codecs.open(filename, 'r', 'utf-8-sig') as md_file:
                     md = md_file.read()
-                if self.resource == 'ta':
+                if self.resource in ['ta']:
                     html = markdown2.markdown(md, extras=['markdown-in-html', 'tables'])
                 else:
                     html = markdown.markdown(md)