Skip to content

Commit

Permalink
Merge pull request #580 from jerome-poisson/SD-5475
Browse files Browse the repository at this point in the history
[NITF] [FORMATTER] HTML 2 NITF conversion
  • Loading branch information
jerome-poisson committed Sep 26, 2016
2 parents 8a09677 + d619fda commit cc00473
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 0 deletions.
151 changes: 151 additions & 0 deletions superdesk/publish/formatters/nitf_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,76 @@ class NITFFormatter(Formatter):
# Location of the XSD describing NITF 3.6.
_schema_ref = 'http://www.iptc.org/std/NITF/3.6/specification/nitf-3-6.xsd'
# "<schema URI> <XSD location>" pair stored under the 'schemaLocation' key.
# NOTE(review): _schema_uri is defined above this excerpt; presumably the
# NITF namespace URI - confirm against the full class body.
_debug_message_extra = {
'schemaLocation': '{} {}'.format(_schema_uri, _schema_ref)}
NITF_COMMON_ATTR = ('id', 'class', 'style')
NITF_ALLOWED_ATTR = {
'p': NITF_COMMON_ATTR + ('lede', 'summary', 'optional-tex'),
'a': NITF_COMMON_ATTR + ('href', 'name', 'rel', 'rev', 'title'),
'br': ('id',),
'em': NITF_COMMON_ATTR,
'q': NITF_COMMON_ATTR,
'hl1': NITF_COMMON_ATTR,
'hl2': NITF_COMMON_ATTR,
'table': NITF_COMMON_ATTR + (
'tabletype',
'align',
'width',
'cols',
'border',
'frame',
'rules',
'cellspacing',
'cellpadding'),
'nitf-table': ('id',),
'ol': NITF_COMMON_ATTR + ('seqnum',),
'ul': NITF_COMMON_ATTR,
'li': NITF_COMMON_ATTR,
'dl': NITF_COMMON_ATTR,
'dt': NITF_COMMON_ATTR,
'dd': NITF_COMMON_ATTR,
'bq': NITF_COMMON_ATTR + ('nowrap', 'quote-source'),
'fn': NITF_COMMON_ATTR,
'note': NITF_COMMON_ATTR + ('noteclass', 'type'),
'pre': NITF_COMMON_ATTR,
'sup': NITF_COMMON_ATTR,
'sub': NITF_COMMON_ATTR,
'hr': NITF_COMMON_ATTR,
}

# HTML tag -> conversion rule used by html2nitf():
#   * 'nitf'   : NITF tag replacing the HTML one (tag is kept when absent);
#                an empty string makes html2nitf() reject the element
#   * 'attrib' : attributes merged into the converted element
# An empty rule means the tag exists under the same name in NITF.
HTML2NITF = {
    'p': dict(),
    'b': dict(nitf='em', attrib={'class': 'bold'}),
    'strong': dict(nitf='em', attrib={'class': 'bold'}),
    'i': dict(nitf='em', attrib={'class': 'italic'}),
    'em': dict(nitf='em', attrib={'class': 'italic'}),
    'u': dict(nitf='em', attrib={'class': 'underscore'}),
    'strike': dict(nitf='em'),
    'sup': dict(),
    'sub': dict(),
    'a': dict(),
    'img': dict(nitf=''),  # <img> use <media> in nitf, so we remove element
    'blockquote': dict(nitf='bq'),
    'pre': dict(),
    'ol': dict(),
    'ul': dict(),
    'li': dict(),
}
# every HTML heading level (h1..h6) becomes <hl2>
# FIXME: hl1 is not used here as it can only appear in <hedline>
HTML2NITF.update(
    ('h{}'.format(level), {'nitf': 'hl2'}) for level in range(1, 7))

def format(self, article, subscriber, codes=None):
try:
Expand Down Expand Up @@ -60,6 +130,87 @@ def get_nitf(self, article, destination, pub_seq_num):
self._format_body_end(article, body_end)
return nitf

def _textToParent(self, parent, children, idx, text):
    """Append ``text`` just before position ``idx`` among ``children``.

    For internal use only.

    :param parent: element owning ``children``
    :param list children: child elements of ``parent``
    :param int idx: position of the element whose text is being relocated
    :param str text: text to keep
    """
    if idx <= 0:
        # nothing precedes the element: the text belongs to the parent itself
        parent.text = (parent.text or '') + text
        return

    # otherwise the text goes right after the preceding sibling, i.e. its tail
    previous = children[idx - 1]
    previous.tail = (previous.tail or '') + text

def html2nitf(self, html_elem, root_elem=True):
    """Convert an HTML element tree, in place, to NITF compatible elements.

    Tags are renamed according to ``HTML2NITF``, attributes which are not
    whitelisted in ``NITF_ALLOWED_ATTR`` for the resulting tag are deleted,
    and children without NITF equivalent are unwrapped: the element itself
    is dropped but its text, tail and own children are kept in its parent.

    :param ET.Element html_elem: HTML to clean/transform (modified in place)
    :param bool root_elem: True if it is the main element (i.e. must not be
        deleted); an unknown main element is converted to ``<p>``
    :return ET.Element: the same element, now NITF compliant
    :raises ValueError: if the element is unknown and is not the main one,
        or if it is mapped to an empty NITF tag (i.e. must be removed)
    """
    nitf_map = self.HTML2NITF.get(html_elem.tag)
    if nitf_map is None:
        if not root_elem:
            raise ValueError("Unhandled HTML element")
        # the main element can't be dropped, a paragraph is the safe fallback
        html_elem.tag = 'p'
    else:
        nitf_elem = nitf_map.get('nitf')
        if nitf_elem is not None:
            if nitf_elem == '':
                raise ValueError("Element need to be removed")
            html_elem.tag = nitf_elem
        # default attributes only exist for mapped tags, so this stays in
        # the branch where a mapping is available
        html_elem.attrib.update(nitf_map.get('attrib', {}))

    attr_allowed = self.NITF_ALLOWED_ATTR.get(html_elem.tag, ())

    # iterate on a copy of the keys as attributes are deleted on the fly
    for attr in list(html_elem.attrib):
        if attr not in attr_allowed:
            del html_elem.attrib[attr]

    # "children" mirrors the element's child list and is kept in sync with it
    # so indexes stay valid while elements are inserted/removed
    children = list(html_elem)
    idx = 0
    while idx < len(children):
        child = children[idx]
        try:
            self.html2nitf(child, root_elem=False)
        except ValueError:
            # the element is unknown (or must be removed):
            # we need to save its text and tail,
            # and put its children to parent
            grandchildren = list(child)

            if child.text:
                self._textToParent(html_elem, children, idx, child.text)

            if child.tail:
                # we copy tail to last grandchild tail
                # or we append to parent/sibling if there is no grandchild
                if grandchildren:
                    last = grandchildren[-1]
                    last.tail = (last.tail or '') + child.tail
                else:
                    self._textToParent(html_elem, children, idx, child.tail)

            # we move elem children to parent, at the place of the element
            for offset, grandchild in enumerate(grandchildren):
                insert_idx = idx + offset
                html_elem.insert(insert_idx, grandchild)
                children.insert(insert_idx, grandchild)

            # and remove the element
            html_elem.remove(child)
            children.remove(child)

            # idx is not incremented, so the moved children are parsed next
        else:
            idx += 1

    return html_elem

def _format_tobject(self, article, head):
    """Append a ``<tobject>`` element of type "news" to ``head``.

    :param article: item being formatted (not used here)
    :param head: parent element receiving the new ``<tobject>``
    :return: the created element
    """
    attributes = {'tobject.type': 'news'}
    tobject = SubElement(head, 'tobject', attributes)
    return tobject

Expand Down
13 changes: 13 additions & 0 deletions tests/publish/nitf_formatter_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,19 @@ def test_formatter(self):
self.assertEqual(nitf_xml.find('body/body.content/p').text, 'test body')
self.assertEqual(nitf_xml.find('head/docdata/urgency').get('ed-urg'), '2')

def test_html2nitf(self):
    """Known tags are mapped, unknown ones unwrapped, bad attributes dropped."""
    source = (
        '<p><strong>this text should be <i>modified</i></strong> so '
        '<span>[this should not be removed]</span> unkown <em unknown_attribute="toto">'
        'elements</em> and <a bad_attribute="to_remove">attributes</a> are <h6>'
        'removed</h6></p>')
    # <span> has no NITF equivalent: only its text must survive
    wanted = (
        '<p><em class="bold">this text should be <em class="italic">modified</em>'
        '</em> so [this should not be removed] unkown <em class="italic">elements</em>'
        ' and <a>attributes</a> are <hl2>removed</hl2></p>')

    converted = self.formatter.html2nitf(etree.fromstring(source))

    self.assertEqual(etree.tostring(converted, 'unicode'), wanted)

def test_company_codes(self):
article = {
'guid': 'tag:aap.com.au:20150613:12345',
Expand Down

0 comments on commit cc00473

Please sign in to comment.