Skip to content

Commit

Permalink
Merge pull request #583 from marwoodandrew/sd-5505
Browse files Browse the repository at this point in the history
[SD-5055] PA NITF ingest Headline, Slugline and pubstatus handling
  • Loading branch information
Mayur Dhamanwala committed Sep 26, 2016
2 parents c031cc3 + 028c1cc commit 517c55f
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 3 deletions.
41 changes: 39 additions & 2 deletions superdesk/io/feed_parsers/pa_nitf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

from superdesk.io.feed_parsers.nitf import NITFFeedParser
from superdesk.io.feed_parsers.nitf import NITFFeedParser, SkipValue
from superdesk.io import register_feed_parser
import re


class PAFeedParser(NITFFeedParser):
Expand All @@ -33,8 +34,44 @@ def _category_mapping(self, elem):
return [{'qcode': 'V'}]
return [{'qcode': 'I'}]

def get_headline(self, xml):
    """
    Pick the headline for the item.

    The text of ``body/body.head/hedline/hl1`` is used when that element exists; otherwise the
    slugline derived from ``head/title`` is used. When neither element exists the value is skipped.

    :param xml: parsed document root to search
    :return: the headline text, or the slugline built from the title
    :raises SkipValue: if the document has neither an ``hl1`` nor a ``title`` element
    """
    hl1 = xml.find('body/body.head/hedline/hl1')
    if hl1 is not None:
        return hl1.text

    title = xml.find('head/title')
    if title is None:
        raise SkipValue()
    return self._get_slugline(title)

def _get_slugline(self, elem):
    """
    Build the slugline from an element's text.

    A leading run of digits/dots, together with the non-word characters that follow it, is removed.
    The first space-delimited word is then capitalized (first character upper-cased, the rest
    lower-cased); everything after the first space is kept unchanged.

    :param elem: element whose ``text`` holds the raw slugline
    :return: the cleaned slugline, or an empty string when the element has no text
    """
    # Raw string: \d and \W must reach the regex engine instead of being parsed as string escapes.
    # ``elem.text`` is None for an empty element, so fall back to '' rather than failing in re.sub.
    text = re.sub(r'^[\d.]+\W+', '', elem.text or '')
    first_word, separator, remainder = text.partition(' ')
    return first_word.capitalize() + separator + remainder

def _get_pubstatus(self, elem):
    """
    Derive the pubstatus from the element's ``management-status`` attribute.

    An ``embargoed`` status is reported as ``usable`` (the editorial note still describes the
    embargo); any other status is passed through unchanged.

    :param elem: element carrying the ``management-status`` attribute
    :return: ``'usable'`` for embargoed content, otherwise the attribute value
    """
    status = elem.attrib['management-status']
    if status == 'embargoed':
        return 'usable'
    return status

def __init__(self):
# Sets up self.MAPPING for this parser and then runs the base-class initialiser.
# NOTE(review): this listing comes from a diff view with the +/- markers stripped; the
# single-line assignment below looks like the pre-change line that the three-line
# assignment after it replaces — confirm before treating both as live code.
self.MAPPING = {'anpa_category': {'xpath': "head/meta/[@name='category']", 'filter': self._category_mapping}}
# Each entry pairs an xpath with the method used as its 'filter':
# anpa_category -> _category_mapping, slugline -> _get_slugline, pubstatus -> _get_pubstatus.
self.MAPPING = {'anpa_category': {'xpath': "head/meta/[@name='category']", 'filter': self._category_mapping},
'slugline': {'xpath': 'head/title', 'filter': self._get_slugline},
'pubstatus': {'xpath': 'head/docdata', 'filter': self._get_pubstatus}}
super().__init__()

register_feed_parser(PAFeedParser.NAME, PAFeedParser())
25 changes: 24 additions & 1 deletion tests/io/feed_parsers/pa_nitf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,30 @@ class PAFileWithNoSubjects(PANITFFileTestCase):
filename = 'pa2.xml'

def test_headline(self):
# Checks the 'headline' value of the item parsed from this test case's fixture.
# NOTE(review): this listing comes from a diff view with the +/- markers stripped; the first
# assertion looks like the pre-change expectation and the second its replacement — confirm.
self.assertEqual(self.item.get('headline'), '1 SOCCER INT-Teams')
self.assertEqual(self.item.get('headline'), 'Soccer INT-Teams')

def test_anpa_category(self):
    """The parsed item carries exactly one ANPA category, with qcode 'S'."""
    expected_categories = [{'qcode': 'S'}]
    self.assertEqual(self.item.get('anpa_category'), expected_categories)


class PATestCase(PANITFFileTestCase):
    """Field-level checks for the item parsed from the ``pa1.xml`` fixture."""

    filename = 'pa1.xml'

    def test_slugline(self):
        """Check the slugline along with the other fields parsed from the same fixture."""
        parsed = self.item

        # Text fields derived from the title and headline elements.
        self.assertEqual(parsed.get('slugline'), 'Sport Trivia (Oct 14)')
        self.assertEqual(parsed.get('headline'), 'PA SPORT TRIVIA (OCTOBER 14)')

        # Status, identity and format.
        self.assertEqual('usable', parsed.get('pubstatus'))
        self.assertEqual('af1f7ad5-5619-49de-84cc-2e608538c77fSSS-3-1', parsed.get('guid'))
        self.assertEqual(parsed.get('format'), 'HTML')

        # Classification and size.
        self.assertEqual(4, len(parsed.get('subject')))
        self.assertIn('Trivia (Oct 14)', parsed.get('keywords'))
        self.assertEqual(665, parsed.get('word_count'))


class PAEmbargoTestCase(PANITFFileTestCase):
    """Checks pubstatus handling for the ``pa3.xml`` fixture."""

    filename = 'pa3.xml'

    # Named for what it asserts: the method was previously called test_slugline although it
    # only checks pubstatus.
    def test_pubstatus(self):
        """The item parsed from the fixture is reported with pubstatus 'usable'."""
        self.assertEqual(self.item.get('pubstatus'), 'usable')
42 changes: 42 additions & 0 deletions tests/io/fixtures/pa3.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<nitf version="-//IPTC//DTD NITF 3.1//EN">
<head>
<meta name="service" content="Comprehensive"/>
<title>1 POLITICS UnitedKingdom</title>
<meta name="category" content="HHH"/>
<meta name="optInfo" content=""/>
<tobject>
<tobject.subject tobject.subject.refnum="04015000"/>
</tobject>
<docdata management-status="embargoed">
<du-key key="PA-HHH-POLITICS-UnitedKingdom"/>
<doc-id id-string="4a82e12e-6b39-47eb-8e59-2c616a7e32a4HHH-13-1"/>
<urgency ed-urg="4"/>
<doc-scope scope="POLITICS"/>
<series series.part="1"/>
<key-list>
<keyword key="UnitedKingdom"/>
</key-list>
<del-list>
<from-src src-name="PA" level-number="1"/>
</del-list>
<date.issue norm="20160908T132446+0100"/>
<date.release norm="20160909T000100+0100"/>
<ed-msg info="(see also POLITICS EU)
Embargoed to 0001 Friday September 9"/>
<doc.copyright holder="Press Association" year="2016"/>
</docdata>
</head>
<body>
<body.head>
<hedline><hl1>UK BREAK-UP 'NOT INCONCEIVABLE', FIRST MINISTER WARNS AHEAD OF BREXIT TALKS</hl1></hedline>
<byline>By Benjamin Wright, Press Association</byline>
<abstract>
<p>The UK risks breaking</p>
</abstract>
</body.head>
<body.content>
<p>The UK risks breaking up unless all home nations play a hands-on role in the Brexit negotiations</p></body.content>
</body>
</nitf>

0 comments on commit 517c55f

Please sign in to comment.