Permalink
Browse files

Next generation html2vimdoc - just a start

  • Loading branch information...
1 parent 156fb53 commit 8fd076453513bdc7604cc38381d73f139d7d1ebe @xolox committed May 22, 2013
Showing with 403 additions and 0 deletions.
  1. +278 −0 html2vimdoc-ng.py
  2. 0 libs/__init__.py
  3. +125 −0 libs/soupselect.py
View
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+"""
+An object oriented replacement for html2vimdoc.py which supports nested block
+level elements and other nasty complexities (in converting HTML to Vim's plain
+text help file format).
+
+1. Parse input HTML using ``BeautifulSoup``
+2. Build an AST of everything we know how to convert, drop the rest
+3. Recurse into the AST to perform the conversion to Vim help file?
+
+"""
+
+# Standard library modules.
+import re
+import sys
+import textwrap
+
+# External dependency, install with:
+# sudo apt-get install python-beautifulsoup
+# pip install beautifulsoup
+from BeautifulSoup import BeautifulSoup, NavigableString
+
+# External dependency, bundled because it's not on PyPI.
+import libs.soupselect as soupselect
+
# Maximum width (in characters) of a line in the generated help file.
TEXT_WIDTH = 79

# Number of spaces of indentation per nesting level.
SHIFT_WIDTH = 2
+
def main():
    """
    Convert an HTML file to the Vim help file format and print the result.

    The input file is taken from the first command line argument. Without
    arguments the demo document ``demo/apr-0.17.html`` is converted. The
    rendered text is encoded as UTF-8 before it is printed.
    """
    filename = sys.argv[1] if len(sys.argv) > 1 else 'demo/apr-0.17.html'
    with open(filename) as handle:
        html = handle.read()
    vimdoc = Converter().convert(html)
    # A parenthesized single argument is accepted by the Python 2 print
    # statement as well as by the print function.
    print(vimdoc.encode('utf-8'))
+
class Converter(object):

    """
    Converts HTML source code to the Vim help file format by parsing it with
    BeautifulSoup, simplifying the result into a tree of ``Node`` objects and
    rendering that tree.
    """

    def convert(self, html, selector='#content'):
        """
        Convert HTML source code to the Vim help file format.

        :param html: The HTML source code (a string).
        :param selector: CSS selector used to find the element whose contents
                         should be converted (see ``find_root_node()``).
        :returns: The rendered text (the result of rendering the simplified
                  tree at indentation level zero).
        """
        # TODO Use soupselect to remove unwanted bits of HTML (e.g. Lua/APR test coverage info).
        html = self.decode_hexadecimal_entities(html)
        tree = BeautifulSoup(html, convertEntities=BeautifulSoup.ALL_ENTITIES)
        root = self.find_root_node(tree, selector)
        extract = self.simplify_tree(root)
        # TODO Shift headings so top level headings have level 1?
        # Debugging aid: dump the simplified tree on the standard error stream.
        sys.stderr.write("Parse tree:\n%s\n" % extract)
        return extract.render(level=0)

    def decode_hexadecimal_entities(self, html):
        """
        Based on my testing BeautifulSoup doesn't support hexadecimal HTML
        entities, so we have to decode them ourselves :-(

        ASCII code points are replaced by the literal character (or by a named
        entity when the character has a special meaning in HTML). Code points
        of 128 and above are rewritten as decimal character references so that
        no non-ASCII text has to be inserted into the (possibly still encoded)
        HTML; decoding those is left to the HTML parser.

        :param html: The HTML source code (a string).
        :returns: The HTML source code with hexadecimal references replaced.
        """
        # If we happen to decode an entity into one of these characters, we
        # should never insert it literally into the HTML because we'll screw
        # up the syntax.
        unsafe_to_decode = {
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
            "'": '&apos;',
            '&': '&amp;',
        }
        def decode_entity(match):
            code_point = int(match.group(1), 16)
            if code_point > 0x10FFFF:
                # Not a valid Unicode code point: leave the text untouched
                # instead of raising an exception.
                return match.group(0)
            if code_point >= 128:
                # chr() only accepts values below 256 on Python 2 and would
                # produce a raw byte for 128-255, so pass these on as decimal
                # references instead.
                return '&#%d;' % code_point
            character = chr(code_point)
            return unsafe_to_decode.get(character, character)
        # HTML allows both "&#x...;" and "&#X...;".
        return re.sub(r'&#[xX]([0-9A-Fa-f]+);', decode_entity, html)

    def find_root_node(self, tree, selector):
        """
        Given a document tree generated by BeautifulSoup, find the most
        specific document node that doesn't "lose any information" (i.e.
        everything that we want to be included in the Vim help file) while
        ignoring as much fluff as possible (e.g. headers, footers and
        navigation menus).

        :param tree: The document tree generated by BeautifulSoup.
        :param selector: CSS selector for the preferred root node.
        :returns: The first node matching the selector, otherwise the
                  ``<body>`` element, otherwise the tree itself.
        """
        # Try to find the root node using a CSS selector provided by the caller.
        matches = soupselect.select(tree, selector)
        if matches:
            return matches[0]
        # Otherwise we'll fall back to the <body> element.
        try:
            body = tree.html.body
        except AttributeError:
            # Don't break when html.body doesn't exist.
            return tree
        # NOTE(review): BeautifulSoup appears to return None (rather than
        # raise) for a missing child tag; never hand None to the caller.
        return tree if body is None else body

    def simplify_tree(self, tree):
        """
        Simplify the tree generated by BeautifulSoup into something we can
        easily generate a Vim help file from.

        :param tree: The root node selected by ``find_root_node()``.
        :returns: The simplified node (see ``simplify_node()``).
        """
        # TODO Ignore <head>; find <body>.
        return self.simplify_node(tree)

    def simplify_node(self, node, block_level=True):
        """
        Convert a single BeautifulSoup node into a parse tree node.

        Known elements are mapped to their own node class, strings become
        ``Text`` nodes and any other node is replaced by the sequence of its
        simplified children.

        :param node: The BeautifulSoup node to simplify.
        :param block_level: Whether children of unknown containers, lists,
                            list items and tables are collected in a block
                            level sequence (``True``) or an inline sequence.
        :returns: A parse tree node.
        """
        name = getattr(node, 'name', None)
        if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            # The digit in the tag name is the heading level.
            return Heading(level=int(name[1]),
                           contents=self.simplify_children(node, block_level=False))
        elif name == 'p':
            return Paragraph(contents=self.simplify_children(node, block_level=False))
        elif name == 'pre':
            # Keep only the text of preformatted blocks, dropping any markup.
            return PreformattedText(contents=[''.join(node.findAll(text=True))])
        elif name in ('ul', 'ol'):
            return List(contents=self.simplify_children(node, block_level=block_level),
                        ordered=(name == 'ol'))
        elif name == 'li':
            return ListItem(contents=self.simplify_children(node, block_level=block_level))
        elif name == 'table':
            return Table(contents=self.simplify_children(node, block_level=block_level))
        elif isinstance(node, NavigableString):
            return Text(node.string)
        else:
            return self.simplify_children(node, block_level=block_level)

    def simplify_children(self, node, block_level):
        """
        Simplify the children of a BeautifulSoup node.

        :param node: The BeautifulSoup node whose ``contents`` are simplified
                     (nodes without ``contents`` yield an empty sequence).
        :param block_level: ``True`` to return a ``BlockLevelSequence``,
                            ``False`` to return an ``InlineSequence``.
        :returns: A sequence node holding the simplified children.
        """
        contents = []
        for child in getattr(node, 'contents', []):
            simplified_child = self.simplify_node(child, block_level=block_level)
            # NOTE(review): the node classes in this file don't define
            # truthiness, so this test looks like it is always true.
            if simplified_child:
                contents.append(simplified_child)
        if block_level:
            return BlockLevelSequence(contents=contents)
        else:
            return InlineSequence(contents=contents)
+
+# Abstract parse tree nodes.
+
class Node(object):

    """
    Base class of all parse tree nodes. Keyword arguments given to the
    constructor are stored as attributes of the node.
    """

    def __init__(self, **kw):
        # Turn every keyword argument into an instance attribute.
        for name in kw:
            setattr(self, name, kw[name])

    def __repr__(self):
        # Render as ClassName(child, child, ...) using the node's contents.
        rendered_children = [repr(child) for child in self.contents]
        return "%s(%s)" % (type(self).__name__, ", ".join(rendered_children))

    def indent(self, text, level):
        """
        Indent every line of the given text.

        :param text: A string or a list of lines.
        :param level: The nesting level (multiplied by ``SHIFT_WIDTH`` to get
                      the number of spaces).
        :returns: The indented text as a single string.
        """
        if isinstance(text, list):
            text = "\n".join(text)
        prefix = " " * (level * SHIFT_WIDTH)
        indented_lines = [prefix + line for line in text.splitlines()]
        return "\n".join(indented_lines)
+
class BlockLevelNode(Node):

    """
    Marker base class for block level parse tree nodes.
    """
+
class InlineNode(Node):

    """
    Marker base class for inline parse tree nodes.
    """
+
+# Concrete parse tree nodes.
+
class BlockLevelSequence(BlockLevelNode):

    """
    A sequence of nodes that is rendered as a series of blocks.
    """

    def __iter__(self):
        # Iterating over the sequence iterates over its child nodes.
        children = self.contents
        return iter(children)

    def render(self, level):
        """
        Render the child nodes as blocks at the given nesting level.
        """
        rendered = join_blocks(self.contents, level=level)
        return rendered
+
class Heading(BlockLevelNode):

    """
    A heading. The ``level`` attribute holds the heading level (1 for
    ``<h1>`` up to 6 for ``<h6>``), ``contents`` holds the inline child nodes.
    """

    def render(self, level):
        """
        Render the heading as a marker line followed by the wrapped text.

        :param level: The nesting (indentation) level; only passed on to the
                      inline child nodes.
        :returns: The rendered heading as a string.
        """
        # Join the inline child nodes together into a single string.
        text = join_inline(self.contents, level=level)
        # Wrap the heading's text. The two character difference is " ~", the
        # suffix used to mark Vim help file headings.
        lines = [line + " ~" for line in textwrap.wrap(text, width=TEXT_WIDTH - 2)]
        # Add a line with the marker symbol for headings, repeated on the full
        # line, at the top of the heading. The symbol depends on the level of
        # the heading itself, not on the nesting level we're rendered at.
        marker = '=' if self.level == 1 else '-'
        lines.insert(0, marker * TEXT_WIDTH)
        return "\n".join(lines)
+
class Paragraph(BlockLevelNode):

    """
    A paragraph of inline nodes, rendered as wrapped and indented text.
    """

    def render(self, level):
        """
        Wrap the paragraph's text to ``TEXT_WIDTH`` characters, indenting
        every line by ``level * SHIFT_WIDTH`` spaces.
        """
        prefix = " " * (level * SHIFT_WIDTH)
        text = join_inline(self.contents, level=level)
        wrapped_lines = textwrap.wrap(text,
                                      width=TEXT_WIDTH,
                                      initial_indent=prefix,
                                      subsequent_indent=prefix)
        return "\n".join(wrapped_lines)
+
class PreformattedText(BlockLevelNode):

    """
    A block of preformatted text. The text itself is the first (and only)
    element of ``contents``.
    """

    def render(self, level):
        """
        Render the preformatted text.

        The text is dedented, stripped of leading/trailing blank lines and
        then indented four spaces deeper than the surrounding text, preceded
        by a line containing the ``>`` marker.

        :param level: The nesting level of the surrounding text.
        :returns: The rendered block as a string.
        """
        # Surrounding text is indented by level * SHIFT_WIDTH spaces, the
        # preformatted text gets four more.
        indent = " " * (level * SHIFT_WIDTH + 4)
        # Debugging aid: dump the raw text on the standard error stream.
        sys.stderr.write("self.contents = %r\n" % self.contents)
        # Remove common indentation from the original text.
        text = textwrap.dedent(self.contents[0])
        # Remove leading/trailing empty lines.
        lines = text.splitlines()
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop(-1)
        # Add a Vim help file marker indicating the preformatted text,
        # followed by the indented original text.
        output = [">"]
        output.extend(indent + line for line in lines)
        return "\n".join(output)
+
class List(BlockLevelNode):

    """
    An ordered or unordered list. The ``ordered`` attribute selects numbered
    bullets, ``contents`` holds the child nodes.
    """

    def render(self, level):
        """
        Render the list items, each prefixed by a bullet or number.

        Child nodes that are not ``ListItem`` objects (e.g. whitespace between
        the items) are ignored and don't influence the numbering. Items are
        separated by an empty line as soon as one of them spans several lines.

        :param level: The nesting level of the list.
        :returns: The rendered list as a string.
        """
        items = []
        delimiter = '\n'
        number = 0
        for node in self.contents:
            if not isinstance(node, ListItem):
                continue
            # Only actual list items are counted.
            number += 1
            bullet = '%i. ' % number if self.ordered else '- '
            # Nest the item's contents deep enough to make room for the
            # bullet; floor division keeps the level an integer.
            text = node.render(level=level + (len(bullet) // SHIFT_WIDTH))
            # The bullet takes the place of the first line's indentation.
            items.append(bullet + text.lstrip())
            if '\n' in text:
                delimiter = '\n\n'
        return delimiter.join(items)
+
class ListItem(BlockLevelNode):

    """
    A single item in a list, rendered as a series of blocks.
    """

    def render(self, level):
        """
        Render the item's child nodes as blocks at the given nesting level.
        """
        children = self.contents
        return join_blocks(children, level=level)
+
class Table(BlockLevelNode):

    """
    A table. Rendering is not implemented: tables produce no output.
    """

    def render(self, level):
        """
        Return an empty string regardless of the table's contents.
        """
        nothing = ''
        return nothing
+
class InlineSequence(InlineNode):

    """
    A sequence of nodes that is rendered as a single run of inline text.
    """

    def __iter__(self):
        # Iterating over the sequence iterates over its child nodes.
        children = self.contents
        return iter(children)

    def render(self, level):
        """
        Render the child nodes and join them into one compacted string.
        """
        rendered = join_inline(self.contents, level=level)
        return rendered
+
class Text(InlineNode):

    """
    A piece of literal text, stored as the only element of ``contents``.
    """

    def __init__(self, value):
        # Wrap the text in a list so that it's stored like the children of
        # other nodes.
        contents = [value]
        self.contents = contents

    def render(self, level):
        """
        Return the text unchanged (the nesting level is ignored).
        """
        value = self.contents[0]
        return value
+
def join_blocks(nodes, level):
    """
    Render a sequence of block level nodes and join the results into a single
    string. Nodes that render to nothing (or only whitespace) are skipped.
    Blocks are separated by an empty line, except that preformatted text
    follows the previous block after a single line break.
    """
    pieces = []
    for node in nodes:
        rendered = node.render(level=level)
        if not rendered or rendered.isspace():
            continue
        if pieces:
            # Only blocks after the first one are preceded by a separator.
            if isinstance(node, PreformattedText):
                pieces.append('\n')
            else:
                pieces.append('\n\n')
        pieces.append(rendered)
    return ''.join(pieces)
+
def join_inline(nodes, level):
    """
    Render a sequence of inline nodes, concatenate the results and compact
    the whitespace of the combined string.
    """
    rendered_nodes = (node.render(level=level) for node in nodes)
    combined = "".join(rendered_nodes)
    return compact(combined)
+
def compact(text):
    """
    Collapse every run of whitespace in a string into a single space and
    strip the whitespace from both ends.
    """
    words = text.split()
    return " ".join(words)
+
# Only run the conversion when executed as a script, not when imported.
if __name__ == '__main__':
    main()
+
+# vim: ft=python ts=4 sw=4 et
View
No changes.
Oops, something went wrong.

0 comments on commit 8fd0764

Please sign in to comment.