Skip to content

Commit

Permalink
Merge "timestripper: prevent recognizing components too far from each…
Browse files Browse the repository at this point in the history
… other"
  • Loading branch information
jenkins-bot authored and Gerrit Code Review committed Dec 3, 2016
2 parents c766b1c + f86235a commit 5a67df7
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 15 deletions.
38 changes: 23 additions & 15 deletions pywikibot/textlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@
'or': u'୦୧୨୩୪୫୬୭୮୯',
}

# Used in TimeStripper. When a timestamp-like line has a gap longer than
# this between adjacent date components (year, month, etc.), the line is
# not considered to contain a timestamp. The gap is the distance from the
# end of one matched component to the start of the next one, in match
# positions within the line.
TIMESTAMP_GAP_LIMIT = 10


def to_local_digits(phrase, lang):
"""
Expand Down Expand Up @@ -1977,17 +1982,19 @@ def marker(m):
return (txt, None)

@staticmethod
def _valid_date_dict_order(dateDict):
def _valid_date_dict_positions(dateDict):
"""Check consistency of reasonable positions for groups."""
day_pos = dateDict['day']['pos']
month_pos = dateDict['month']['pos']
year_pos = dateDict['year']['pos']
time_pos = dateDict['time']['pos']
tzinfo_pos = dateDict['tzinfo']['pos']

date_pos = sorted((day_pos, month_pos, year_pos))
min_pos, max_pos = date_pos[0], date_pos[-1]

time_pos = dateDict['time']['start']
tzinfo_pos = dateDict['tzinfo']['start']
date_pos = sorted(
(dateDict['day'], dateDict['month'], dateDict['year']),
key=lambda x: x['start'])
min_pos, max_pos = date_pos[0]['start'], date_pos[-1]['start']
max_gap = max(x[1]['start'] - x[0]['end']
for x in zip(date_pos, date_pos[1:]))

if max_gap > TIMESTAMP_GAP_LIMIT:
return False
if tzinfo_pos < min_pos or tzinfo_pos < time_pos:
return False
if min_pos < tzinfo_pos < max_pos:
Expand Down Expand Up @@ -2028,15 +2035,16 @@ def timestripper(self, line):
line, match_obj = self._last_match_and_replace(line, pat)
if match_obj:
for group, value in match_obj.groupdict().items():
pos = match_obj.start(group)
# Store also match pos in line, for later order check.
matchDict = {group: {'value': value, 'pos': pos}}
dateDict.update(matchDict)
start, end = (match_obj.start(group), match_obj.end(group))
# The positions are stored for later validation
dateDict[group] = {
'value': value, 'start': start, 'end': end
}

# all fields matched -> date valid
# groups are in a reasonable order.
if (all(g in dateDict for g in self.groups) and
self._valid_date_dict_order(dateDict)):
self._valid_date_dict_positions(dateDict)):
# remove 'time' key, now split in hour/minute and not needed
# by datetime.
del dateDict['time']
Expand Down
22 changes: 22 additions & 0 deletions tests/timestripper_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,28 @@ def test_last_match_and_replace(self):
)


class TestTimeStripperNumberAndDate(TestTimeStripperCase):

    """Lines mixing a plain 4-digit number with timestamp-like parts."""

    family = 'wikipedia'
    code = 'en'

    def test_four_digit_is_not_year_with_no_timestamp(self):
        """No timestamp is found when only a stray number looks like a year."""
        # '2000' is a head count, not a year; the rest of the line has a
        # day, month, time and timezone but no year of its own.
        line = '2000 people will meet on 16 December at 22:00 (UTC).'
        self.assertIsNone(self.ts.timestripper(line))

    def test_four_digit_is_not_year_with_timestamp(self):
        """The real signature year wins over a stray leading number."""
        line = '2000 people will attend. --12:12, 14 December 2015 (UTC)'
        found = self.ts.timestripper(line)
        wanted = datetime.datetime(
            2015, 12, 14, 12, 12, tzinfo=tzoneFixedOffset(0, 'UTC'))
        self.assertEqual(found, wanted)


class TestTimeStripperLanguage(TestCase):

"""Test cases for English language."""
Expand Down

0 comments on commit 5a67df7

Please sign in to comment.