Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

ENH: Enable parsing of date from filename (without strict filename structure) #440

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/guesswork.rst
Expand Up @@ -43,6 +43,16 @@ These however wouldn't work:
* ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
* ``Another Company- Letter of Reference.jpg``

Do I have to be so strict about naming?
---------------------------------------
Rather than using the strict document naming rules, one can also set the option
``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order
that is accepted by dateparser_. Doing so will cause ``paperless`` to default
to any date format that is found in the title, instead of a date pulled from
the document's text, without requiring the strict formatting of the document
filename as described above.

.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings

.. _guesswork-content:

Expand Down
8 changes: 8 additions & 0 deletions paperless.conf.example
Expand Up @@ -127,6 +127,14 @@ PAPERLESS_DEBUG="false"
# "true", the document will instead be opened in the browser, if possible.
#PAPERLESS_INLINE_DOC="false"

# By default, paperless will check the document text for document date information.
# Uncomment the line below to enable checking the document filename for date
# information. The date order can be set to any option as specified in
# https://dateparser.readthedocs.io/en/latest/#settings. The filename will be
# checked first, and if nothing is found, the document text will be checked
# as normal.
#PAPERLESS_FILENAME_DATE_ORDER="YMD"

#
# The following values use sensible defaults for modern systems, but if you're
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
Expand Down
60 changes: 44 additions & 16 deletions src/documents/parsers.py
Expand Up @@ -14,14 +14,18 @@
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)


Expand All @@ -37,6 +41,7 @@ class DocumentParser:

SCRATCH = settings.SCRATCH_DIR
DATE_ORDER = settings.DATE_ORDER
FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
OPTIPNG = settings.OPTIPNG_BINARY

def __init__(self, path):
Expand Down Expand Up @@ -75,30 +80,53 @@ def get_date(self):
Returns the date of the document.
"""

def __parser__(ds, date_order):
"""
Call dateparser.parse with a particular date ordering
"""
return dateparser.parse(ds,
settings={"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True})
date = None
date_string = None

next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
title = os.path.basename(self.document_path)

# if filename date parsing is enabled, search there first:
if self.FILENAME_DATE_ORDER:
self.log("info", "Checking document title for date")
for m in re.finditer(DATE_REGEX, title):
date_string = m.group(0)

try:
date = __parser__(date_string, self.FILENAME_DATE_ORDER)
except TypeError:
# Skip all matches that do not parse to a proper date
continue

if date is not None and next_year > date.year > 1900:
self.log("info",
"Detected document date {} based on string {} "
"from document title"
"".format(date.isoformat(), date_string))
return date

try:
# getting text after checking filename will save time if only
# looking at the filename instead of the whole text
text = self.get_text()
except ParseError:
return None

next_year = timezone.now().year + 5 # Arbitrary 5 year future limit

# Iterate through all regex matches and try to parse the date
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):

date_string = m.group(0)

try:
date = dateparser.parse(
date_string,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
date = __parser__(date_string, self.DATE_ORDER)
except TypeError:
# Skip all matches that do not parse to a proper date
continue
Expand Down
1 change: 1 addition & 0 deletions src/paperless/settings.py
Expand Up @@ -306,6 +306,7 @@ def __get_boolean(key, default="NO"):

# Specify the default date order (for autodetected dates)
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")

# Specify for how many years a correspondent is considered recent. Recent
# correspondents will be shown in a separate "Recent correspondents" filter as
Expand Down
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.