Skip to content

Commit

Permalink
Allow Pandoc to parse metadata rather than doing it in the plugin.
Browse files Browse the repository at this point in the history
This involves a fairly complicated dance with a Pandoc "filter"
module in order to get all of the metadata to be visible in the
output, but means that all metadata formats supported by Pandoc
are available without the need for any additional Python modules.
It also means strings in metadata will be processed as Markdown.

NOTE: Thanks to jgm/pandoc#2026 and
backward compatibility constraints, this change defaults to
enabling 'mmd_title_block' and *disabling* 'pandoc_title_block' and
'yaml_metadata_block'.  Moreover, putting either +pandoc_title_block or
+yaml_metadata_block in PANDOC_EXTENSIONS will cause mmd_title_block to
be disabled.
  • Loading branch information
zackw committed Mar 30, 2015
1 parent 9ef0197 commit ccffc27
Show file tree
Hide file tree
Showing 3 changed files with 249 additions and 35 deletions.
19 changes: 12 additions & 7 deletions README.md
@@ -1,25 +1,23 @@
pandoc_reader
=============

A pandoc [markdown] reader plugin for [pelican]
A pandoc [markdown][] reader plugin for [pelican][]


Requirements
------------

- [pandoc] in $PATH

- [pandoc][] in `$PATH`

Installation
------------

Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst).


Configuration
-------------

Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter.
Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter.

PANDOC_ARGS = [
'--mathjax',
Expand All @@ -29,14 +27,19 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p
'--number-sections',
]

Pandoc's markdown extensions can be enabled or disabled via the
PANDOC_EXTENSIONS parameter.
Pandoc's syntactic extensions to Markdown can be enabled or disabled via the
`PANDOC_EXTENSIONS` parameter.

PANDOC_EXTENSIONS = [
'+hard_line_breaks',
'-citations'
]

File Metadata
-------------

For compatibility with older versions of this plugin that parsed MultiMarkdown-like title blocks internally, the [`mmd_title_block`][mmd_title_block] syntax extension is enabled by default. Unfortunately, this causes Pandoc to misinterpret YAML metadata and possibly also native title blocks (see [Pandoc issue 2026][]). Therefore, those metadata formats are *disabled* by default. To revert to Pandoc's default behavior (accepting native title blocks and YAML metadata, but not MMD title blocks), include `-mmd_title_block` in `PANDOC_EXTENSIONS`.

Contributing
------------

Expand All @@ -50,3 +53,5 @@ Contributing
[markdown]: http://daringfireball.net/projects/markdown/
[pandoc]: http://johnmacfarlane.net/pandoc/
[pelican]: http://getpelican.com
[mmd_title_block]: http://johnmacfarlane.net/pandoc/README.html#extension-mmd_title_block
[Pandoc issue 2026]: https://github.com/jgm/pandoc/issues/2026
47 changes: 47 additions & 0 deletions embed_metadata_filter.py
@@ -0,0 +1,47 @@
# This is a filter script which embeds all of the metadata parsed by
# Pandoc into the HTML output, where the main body of the reader can
# pick it up. In order to preserve Pandoc's translation of Markdown
# in metadata values, we convert the metadata structure into an HTML
# tree structure. A <hr> separates the translated metadata from the
# document itself.
#
# See http://johnmacfarlane.net/pandoc/scripting.html for documentation
# of the JSON-serialized AST that we are manipulating.

import json
import sys

def N(t, c, cls=None):
    """Build one node of Pandoc's JSON-serialized AST.

    t is the node's type tag and c its contents.  When cls is given,
    the contents are wrapped as [attributes, c], where the attributes
    consist of an empty identifier, the single class cls, and an empty
    key/value list.
    """
    contents = c if cls is None else [["", [cls], []], c]
    return {"t": t, "c": contents}

def cvt_metainlines(c):
    """Render a list of inline nodes as a metadata value.

    The inlines are placed in a Span of class "metavalue", which is in
    turn the sole content of a Plain block.
    """
    span = N("Span", c, "metavalue")
    return N("Plain", [span])

def cvt_metamap(c):
    """Render a metadata mapping as a DefinitionList node.

    Entries appear in sorted key order.  Each term is a single Str node
    holding the key; each entry has one definition, consisting of the
    converted value.
    """
    entries = []
    for key in sorted(c):
        term = [N("Str", key)]
        definitions = [[convert(c[key])]]
        entries.append((term, definitions))
    return N("DefinitionList", entries)

# Dispatch table used by convert(): maps the type tag of a metadata
# node to the function that renders that node's contents as ordinary
# document nodes.
CONVERTERS = dict(
    MetaMap=cvt_metamap,
    MetaInlines=cvt_metainlines,
    # Booleans and plain strings are rendered as a single Str inline.
    MetaBool=lambda c: cvt_metainlines([N("Str", str(c).lower())]),
    MetaString=lambda c: cvt_metainlines([N("Str", c)]),
    # Block-level values go in a Div rather than a Span.
    MetaBlocks=lambda c: N("Div", c, "metavalue"),
    # One bullet per list element, each holding the converted element.
    MetaList=lambda c: N("BulletList", [[convert(item)] for item in c]),
)

def convert(item):
    """Render one metadata node using the converter for its type tag.

    Raises KeyError if the node's type has no entry in CONVERTERS.
    """
    handler = CONVERTERS[item["t"]]
    return handler(item["c"])

def main():
    """Filter entry point: JSON document on stdin, JSON document on stdout.

    Element 0 of the input is passed through unchanged; its 'unMeta'
    mapping is rendered as a definition list which, followed by a
    horizontal rule, is prepended to element 1 (the document content).
    """
    doc = json.load(sys.stdin)
    header = doc[0]
    body = [cvt_metamap(header['unMeta']), N("HorizontalRule", [])]
    body += doc[1]
    json.dump([header, body], sys.stdout, separators=(',', ':'))


# This filter script is imported by pandoc_reader in order to learn its
# actual filename, so don't do anything unless invoked as __main__.
if __name__ == '__main__':
    main()
218 changes: 190 additions & 28 deletions pandoc_reader.py
@@ -1,44 +1,206 @@
import subprocess
import sys

import logging
logger = logging.getLogger(__name__)

try: import xml.etree.cElementTree as ET
except ImportError: import xml.etree.ElementTree as ET

try: from io import StringIO
except ImportError: from cStringIO import StringIO

from pelican import signals
from pelican.readers import BaseReader
from pelican.utils import pelican_open

from . import embed_metadata_filter

def check_command(proc, cmd):
    """Wait for PROC to finish and insist that it succeeded.

    This is roughly what subprocess.check_call does for a process that
    has already been started.  CMD should be the command that was
    passed to subprocess.Popen; it is only used to label the
    subprocess.CalledProcessError raised when PROC's exit status is
    nonzero.
    """
    exit_status = proc.wait()
    if not exit_status:
        return
    raise subprocess.CalledProcessError(exit_status, cmd)

def extract_metadata(text):
    """Separate the embedded metadata from pandoc's HTML output.

    A filter script converts Pandoc's internal representation of the
    metadata into an HTML tree structure so that it will make it to
    the output, with strings properly formatted.  That tree comes
    first in TEXT and is separated from the document itself by the
    first "<hr />".

    Returns a (document, metadata) pair.  DOCUMENT is the HTML after
    the separator, stripped of surrounding whitespace.  METADATA is a
    dict in which every <dl> has become a dict keyed by the text of
    its <dt> elements, every <ul> a list, and every <div> or <span> of
    class "metavalue" a string holding that element's inner HTML.

    Raises ValueError if the separator is missing or the metadata tree
    does not have the expected shape.  These are explicit checks
    rather than asserts so that they are still performed under
    "python -O".  The XML parser raises its own error if the metadata
    is not well-formed.
    """

    def malformed(why):
        return ValueError("malformed metadata tree: " + why)

    def to_html(e):
        return (ET.tostring(e, encoding="utf-8", method="html")
                .decode("utf-8"))

    def walk_dl(e):
        rv = {}
        key = None
        for child in e:
            if child.tag == "dt":
                if key is not None:
                    raise malformed("<dt> {!r} has no <dd>".format(key))
                if len(child) != 0:
                    raise malformed("<dt> contains markup")
                key = child.text
            elif child.tag == "dd":
                if key is None:
                    raise malformed("<dd> without a preceding <dt>")
                if len(child) != 1:
                    raise malformed("<dd> for {!r} must contain exactly "
                                    "one element".format(key))
                rv[key] = walk(child[0])
                key = None
            else:
                raise malformed("unexpected <{}> inside <dl>"
                                .format(child.tag))
        return rv

    def walk_ul(e):
        rv = []
        for child in e:
            if child.tag != "li":
                raise malformed("unexpected <{}> inside <ul>"
                                .format(child.tag))
            if len(child) != 1:
                raise malformed("<li> must contain exactly one element")
            rv.append(walk(child[0]))
        return rv

    def walk_value(e):
        if e.get("class") != "metavalue":
            raise malformed("<{}> is not of class 'metavalue'"
                            .format(e.tag))
        # Setting e.tag and e.tail to None temporarily seems to be the
        # least-hassle way to persuade ET.tostring to dump the *contents*
        # of e but not e itself.
        tag = e.tag
        tail = e.tail
        try:
            e.tag = None
            e.tail = None
            return to_html(e).strip()
        finally:
            e.tag = tag
            e.tail = tail

    def walk(e):
        if e.tag == "dl":
            return walk_dl(e)
        if e.tag == "ul":
            return walk_ul(e)
        if e.tag == "div" or e.tag == "span":
            return walk_value(e)
        # Best effort: report the stray element and carry on with no
        # value for it rather than rejecting the whole file.
        logger.error("unexpected metadata structure: %s", to_html(e))
        return None

    metadata, separator, document = text.partition("<hr />")
    if not separator:
        # Without this check the whole document would be handed to the
        # XML parser, which fails with an unrelated-looking error.
        raise ValueError("metadata separator <hr /> not found "
                         "in pandoc output")
    document = document.strip()

    # Remove namespaces from all metadata elements while parsing them.
    # This is necessary because Pandoc thinks you have to put an
    # xmlns= on every use of <math>, and that makes ET.tostring
    # generate tags like <ns0:math>, which an HTML (not XHTML) parser
    # will not understand.
    parser = ET.iterparse(StringIO(metadata))
    for _, el in parser:
        if "}" in el.tag:
            el.tag = el.tag.split("}", 1)[1]

    # The root attribute is available once the input is exhausted.
    root = parser.root
    if root.tag != "dl":
        raise malformed("root element is <{}>, not <dl>".format(root.tag))
    return document, walk_dl(root)

class PandocReader(BaseReader):
    """Pelican reader that renders Markdown files by running pandoc.

    Metadata is parsed by pandoc itself.  Each file goes through a
    three-stage pipeline (pandoc reader -> embed_metadata_filter ->
    pandoc HTML5 writer) and the metadata is then recovered from the
    HTML with extract_metadata().

    Settings read from self.settings:
      PANDOC_ARGS        extra command-line arguments for pandoc
      PANDOC_EXTENSIONS  list of "+name" / "-name" syntax extensions
    """
    enabled = True
    file_extensions = ["md", "markdown", "mkd", "mdown"]

    def memoize_settings(self):
        """Load settings and compute the various subprocess invocations we
        will be using.

        The results are stored in self.pd_cmd_1 (reader), self.pd_cmd_2
        (writer) and self.filt_cmd (filter).  Later calls return
        immediately once those have been computed.
        """
        # Guard on an attribute this method really sets; testing for
        # one that is never assigned would redo the work on every call.
        if hasattr(self, "pd_cmd_1"):
            return

        extra_args = self.settings.get("PANDOC_ARGS", [])

        pos_extensions = set()
        neg_extensions = set()
        for ext in self.settings.get("PANDOC_EXTENSIONS", []):
            if len(ext) >= 2:
                if ext[0] == "-":
                    neg_extensions.add(ext[1:])
                    continue
                elif ext[0] == "+":
                    pos_extensions.add(ext[1:])
                    continue
            logger.error("invalid PANDOC_EXTENSIONS item %r", ext)

        # For compatibility with older versions of this plugin that
        # parsed vaguely MMD-style metadata blocks themselves, we
        # default to +mmd_title_block.  Unfortunately,
        # +mmd_title_block causes Pandoc to mis-parse YAML and
        # possibly also native title blocks (see
        # https://github.com/jgm/pandoc/issues/2026).  Therefore,
        # if there's nothing about title blocks in PANDOC_EXTENSIONS,
        # we also explicitly disable YAML and native title blocks.
        title_block_exts = ("mmd_title_block",
                            "pandoc_title_block",
                            "yaml_metadata_block")
        mentioned = pos_extensions | neg_extensions
        if not any(ext in mentioned for ext in title_block_exts):
            pos_extensions.add("mmd_title_block")
            neg_extensions.add("pandoc_title_block")
            neg_extensions.add("yaml_metadata_block")

        # An extension that is both enabled and disabled is reported
        # and then left out of the command line altogether.
        both_exts = pos_extensions & neg_extensions
        if both_exts:
            logger.error("Pandoc syntax extensions both enabled and disabled: "
                         + " ".join(sorted(both_exts)))
            pos_extensions -= both_exts
            neg_extensions -= both_exts

        syntax = "markdown"
        if pos_extensions:
            syntax += "".join(sorted("+"+ext for ext in pos_extensions))
        if neg_extensions:
            syntax += "".join(sorted("-"+ext for ext in neg_extensions))

        pd_cmd_1 = ["pandoc", "-f", syntax, "-t", "json"]
        pd_cmd_2 = ["pandoc", "-f", "json", "-t", "html5"]
        # We don't know whether the extra_args are relevant to the reader or
        # writer, and it is harmless to supply them to both.
        pd_cmd_1.extend(extra_args)
        pd_cmd_2.extend(extra_args)

        self.pd_cmd_1 = pd_cmd_1
        self.pd_cmd_2 = pd_cmd_2
        self.filt_cmd = [sys.executable, embed_metadata_filter.__file__]
        logger.debug("Reader command: " + " ".join(self.pd_cmd_1))
        logger.debug("Writer command: " + " ".join(self.pd_cmd_2))
        logger.debug("Filter command: " + " ".join(self.filt_cmd))

    def read(self, filename):
        """Convert FILENAME with pandoc.

        Returns a (document, metadata) pair: the HTML of the document
        body, and a dict of metadata whose keys are lowercased and
        whose values have been passed through self.process_metadata.

        Raises subprocess.CalledProcessError, naming the first stage of
        the pipeline that exited unsuccessfully.
        """
        self.memoize_settings()

        # We do not use --filter because that requires the filter to
        # be directly executable.  By constructing a pipeline by hand
        # we can use sys.executable and not worry about #! lines or
        # execute bits.
        PIPE = subprocess.PIPE
        stages = []  # (process, command) for every stage started so far
        try:
            # The child inherits its own copy of the input file, so
            # ours can be closed as soon as the first stage is running.
            with open(filename, "rb") as fp:
                p1 = subprocess.Popen(self.pd_cmd_1, stdin=fp, stdout=PIPE)
            stages.append((p1, self.pd_cmd_1))

            p2 = subprocess.Popen(self.filt_cmd, stdin=p1.stdout, stdout=PIPE)
            stages.append((p2, self.filt_cmd))
            # Drop our copy of each inter-stage pipe once its consumer
            # holds it; otherwise a producer whose consumer exits early
            # would never see a broken pipe and could block forever.
            p1.stdout.close()

            p3 = subprocess.Popen(self.pd_cmd_2, stdin=p2.stdout, stdout=PIPE)
            stages.append((p3, self.pd_cmd_2))
            p2.stdout.close()

            raw = p3.stdout.read()
        finally:
            # Close every pipe end we still hold before waiting, then
            # reap every child, whatever happened above.  Nothing is
            # raised from here, so an exception that is already
            # propagating is not replaced.
            for proc, _ in stages:
                proc.stdout.close()
            for proc, _ in stages:
                proc.wait()

        # All children have been reaped; report the first failure in
        # pipeline order.
        for proc, cmd in stages:
            check_command(proc, cmd)

        text = raw.decode("utf-8")
        document, raw_metadata = extract_metadata(text)
        metadata = {}
        for k, v in raw_metadata.items():
            k = k.lower()
            metadata[k] = self.process_metadata(k, v)

        return document, metadata

def add_reader(readers):
for ext in PandocReader.file_extensions:
Expand Down

0 comments on commit ccffc27

Please sign in to comment.