Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Commit

Permalink
Tweak the date guesser to not allow dates prior to 1900 (#414)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielquinn committed Oct 1, 2018
1 parent a511d34 commit 8010d72
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 7 deletions.
25 changes: 18 additions & 7 deletions src/paperless_tesseract/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def _assemble_ocr_sections(self, imgs, middle, text):
return text

def get_date(self):

date = None
datestring = None

Expand All @@ -217,20 +218,30 @@ def get_date(self):

try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
datestring,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
except TypeError:
# Skip all matches that do not parse to a proper date
continue

if date is not None:
if date is not None and date.year > 1900:
break
else:
date = None

if date is not None:
self.log("info", "Detected document date " + date.isoformat() +
" based on string " + datestring)
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
datestring
)
)
else:
self.log("info", "Unable to detect date for document")

Expand Down
13 changes: 13 additions & 0 deletions src/paperless_tesseract/tests/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,3 +384,16 @@ def test_get_text_9_pdf(self):
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
)

@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
    return_value="01-07-0590 00:00:00"
)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SCRATCH
)
def test_crazy_date(self, *args):
    """Check that an implausibly old date yields no detected date.

    ``get_text`` is patched to return a string whose year is 0590, and
    ``get_date`` is then expected to return ``None`` for it.
    """
    parser = RasterisedDocumentParser("/dev/null")
    # Call the patched get_text() before asking for the date.
    parser.get_text()
    self.assertIsNone(parser.get_date())

0 comments on commit 8010d72

Please sign in to comment.