diff --git a/src/documents/models.py b/src/documents/models.py index a71edf0f6..32c82b2e7 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -197,9 +197,16 @@ class Document(models.Model): TYPE_ODS = "ods" TYPE_ODT = "odt" TYPE_ODP = "odp" + TYPE_XLS = "xls" + TYPE_XLSX = "xlsx" + TYPE_DOC = "doc" + TYPE_DOCX = "docx" + TYPE_PPT = "ppt" + TYPE_PPTX = "pptx" TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, TYPE_TXT, TYPE_CSV, TYPE_MD, TYPE_ODS, TYPE_ODT, - TYPE_ODP) + TYPE_ODP, TYPE_XLS, TYPE_XLSX, TYPE_DOC, TYPE_DOCX, + TYPE_PPT, TYPE_PPTX) STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_GPG = "gpg" diff --git a/src/documents/views.py b/src/documents/views.py index 5b742ab22..8ee4f5425 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -54,11 +54,15 @@ def render_to_response(self, context, **response_kwargs): Document.TYPE_CSV: "text/csv", Document.TYPE_MD: "text/markdown", Document.TYPE_TXT: "text/plain", - Document.TYPE_ODS: - "application/vnd.oasis.opendocument.spreadsheet", - Document.TYPE_ODT: "application/vnd.oasis.opendocument.text", - Document.TYPE_ODP: - "application/vnd.oasis.opendocument.presentation" + Document.TYPE_ODS: "application/vnd.oasis.opendocument.spreadsheet", # NOQA: E501 + Document.TYPE_ODT: "application/vnd.oasis.opendocument.text", # NOQA: E501 + Document.TYPE_ODP: "application/vnd.oasis.opendocument.presentation", # NOQA: E501 + Document.TYPE_DOC: "application/msword", # NOQA: E501 + Document.TYPE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # NOQA: E501 + Document.TYPE_XLS: "application/vnd.ms-excel", # NOQA: E501 + Document.TYPE_XLSX: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # NOQA: E501 + Document.TYPE_PPT: "application/vnd.ms-powerpoint", # NOQA: E501 + Document.TYPE_PPTX: "application/vnd.openxmlformats-officedocument.presentationml.presentation", # NOQA: E501 } if self.kwargs["kind"] == "thumb": diff --git 
a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 03e4bdbc3..3c6632bd4 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -12,7 +12,9 @@ class TikaDocumentParser(DocumentParser): """ - This parser uses Apache Tika to try and get some text out of + This parser uses Apache-Tika to try and get some text out of office + formats, whether it's an open-office (ODS, ODT, ODP), or ms-office + format (XLS, XLSX, DOC, DOCX, PPT, PPTX) """ CONVERT = settings.CONVERT_BINARY diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py index 2a7b06c65..67687ea7b 100644 --- a/src/paperless_tika/signals.py +++ b/src/paperless_tika/signals.py @@ -5,7 +5,7 @@ class ConsumerDeclaration: - MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp)$") + MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp|xlsx?|docx?|pptx?)$") @classmethod def handle(cls, sender, **kwargs): diff --git a/src/paperless_tika/tests/test_signals.py b/src/paperless_tika/tests/test_signals.py index 1d5e0975a..aef3306f8 100644 --- a/src/paperless_tika/tests/test_signals.py +++ b/src/paperless_tika/tests/test_signals.py @@ -12,9 +12,9 @@ def test_test_handles_various_file_names_true(self): "A document with a . in it", "Doc with -- in it" ) suffixes = ( - "ods", "odt", "odp", - "ODS", "ODT", "ODP", - "oDs", "oDt", "oDp" + "ods", "odt", "odp", "xls", "xlsx", "doc", "docx", "ppt", "pptx", + "ODS", "ODT", "ODP", "XLS", "XLSX", "DOC", "DOCX", "PPT", "PPTX", + "oDs", "oDt", "oDp", "xLs", "xLsX", "dOc", "dOcX", "pPt", "pPtX", ) for prefix in prefixes: