the-paperless-project · danielquinn · Dec 1, 2018 · Nov 16, 2018 · Nov 16, 2018 · Nov 16, 2018
diff --git a/docs/guesswork.rst b/docs/guesswork.rst
@@ -43,6 +43,16 @@ These however wouldn't work:
 * ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf``
 * ``Another Company- Letter of Reference.jpg``
 
+Do I have to be so strict about naming?
+---------------------------------------
+Rather than following the strict document naming rules, you can set the option
+``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order
+that is accepted by dateparser_. Doing so will cause ``paperless`` to use a
+date found in the document's filename in preference to a date pulled from the
+document's text (which is still used if the filename contains no date), without
+requiring the strict formatting of the document filename as described above.
+
+.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
 
 .. _guesswork-content:
 

diff --git a/paperless.conf.example b/paperless.conf.example
@@ -127,6 +127,14 @@ PAPERLESS_DEBUG="false"
 # "true", the document will instead be opened in the browser, if possible.
 #PAPERLESS_INLINE_DOC="false"
 
+# By default, paperless only checks the document text for date information.
+# Uncomment the line below to also check the document filename for a date.
+# The date order can be any value accepted by dateparser's DATE_ORDER setting:
+# https://dateparser.readthedocs.io/en/latest/#settings
+# The filename is checked first; if no date is found there, the document text
+# is checked as normal.
+#PAPERLESS_FILENAME_DATE_ORDER="YMD"
+
 #
 # The following values use sensible defaults for modern systems, but if you're
 # running Paperless on a low-resource device (like a Raspberry Pi), modifying

diff --git a/src/documents/parsers.py b/src/documents/parsers.py
@@ -14,14 +14,18 @@
 # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
 DATE_REGEX = re.compile(
-    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
-    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{4})\b'
+    r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +
+    r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +
+    r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +
+    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
+    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
 )
 
 
@@ -37,6 +41,7 @@ class DocumentParser:
 
     SCRATCH = settings.SCRATCH_DIR
     DATE_ORDER = settings.DATE_ORDER
+    FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
     OPTIPNG = settings.OPTIPNG_BINARY
 
     def __init__(self, path):
@@ -75,30 +80,53 @@ def get_date(self):
         Returns the date of the document.
         """
 
+        def __parser__(ds, date_order):
+            """
+            Call dateparser.parse with a particular date ordering
+            """
+            return dateparser.parse(ds,
+                                    settings={"DATE_ORDER": date_order,
+                                              "PREFER_DAY_OF_MONTH": "first",
+                                              "RETURN_AS_TIMEZONE_AWARE":
+                                                  True})
         date = None
         date_string = None
 
+        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
+        title = os.path.basename(self.document_path)
+
+        # if filename date parsing is enabled, search there first:
+        if self.FILENAME_DATE_ORDER:
+            self.log("info", "Checking document title for date")
+            for m in re.finditer(DATE_REGEX, title):
+                date_string = m.group(0)
+
+                try:
+                    date = __parser__(date_string, self.FILENAME_DATE_ORDER)
+                except TypeError:
+                    # Skip all matches that do not parse to a proper date
+                    continue
+
+                if date is not None and next_year > date.year > 1900:
+                    self.log("info",
+                             "Detected document date {} based on string {} "
+                             "from document title"
+                             "".format(date.isoformat(), date_string))
+                    return date
+
         try:
+            # getting text after checking filename will save time if only
+            # looking at the filename instead of the whole text
             text = self.get_text()
         except ParseError:
             return None
 
-        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
-
-        # Iterate through all regex matches and try to parse the date
+        # Iterate through all regex matches in text and try to parse the date
         for m in re.finditer(DATE_REGEX, text):
-
             date_string = m.group(0)
 
             try:
-                date = dateparser.parse(
-                    date_string,
-                    settings={
-                        "DATE_ORDER": self.DATE_ORDER,
-                        "PREFER_DAY_OF_MONTH": "first",
-                        "RETURN_AS_TIMEZONE_AWARE": True
-                    }
-                )
+                date = __parser__(date_string, self.DATE_ORDER)
             except TypeError:
                 # Skip all matches that do not parse to a proper date
                 continue

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
@@ -306,6 +306,7 @@ def __get_boolean(key, default="NO"):
 
 # Specify the default date order (for autodetected dates)
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
+FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 
 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as

diff --git a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf
diff --git a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png
diff --git a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf
diff --git a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png