Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Commit

Permalink
feat(): add ms-office formats
Browse files Browse the repository at this point in the history
  • Loading branch information
Tooa committed Jan 11, 2020
1 parent 34550de commit 06c73d4
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 11 deletions.
9 changes: 8 additions & 1 deletion src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,16 @@ class Document(models.Model):
TYPE_ODS = "ods"
TYPE_ODT = "odt"
TYPE_ODP = "odp"
TYPE_XLS = "xls"
TYPE_XLSX = "xlsx"
TYPE_DOC = "doc"
TYPE_DOCX = "docx"
TYPE_PPT = "ppt"
TYPE_PPTX = "pptx"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD, TYPE_ODS, TYPE_ODT,
TYPE_ODP)
TYPE_ODP, TYPE_XLS, TYPE_XLSX, TYPE_DOC, TYPE_DOCX,
TYPE_PPT, TYPE_PPTX)

STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
Expand Down
14 changes: 9 additions & 5 deletions src/documents/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,15 @@ def render_to_response(self, context, **response_kwargs):
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain",
Document.TYPE_ODS:
"application/vnd.oasis.opendocument.spreadsheet",
Document.TYPE_ODT: "application/vnd.oasis.opendocument.text",
Document.TYPE_ODP:
"application/vnd.oasis.opendocument.presentation"
Document.TYPE_ODS: "application/vnd.oasis.opendocument.spreadsheet", # NOQA: E501
Document.TYPE_ODT: "application/vnd.oasis.opendocument.text", # NOQA: E501
Document.TYPE_ODP: "application/vnd.oasis.opendocument.presentation", # NOQA: E501
Document.TYPE_DOC: "application/msword", # NOQA: E501
Document.TYPE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # NOQA: E501
Document.TYPE_XLS: "application/vnd.ms-excel", # NOQA: E501
Document.TYPE_XLSX: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # NOQA: E501
Document.TYPE_PPT: "application/vnd.ms-powerpoint", # NOQA: E501
Document.TYPE_PPTX: "application/vnd.openxmlformats-officedocument.presentationml.presentation", # NOQA: E501
}

if self.kwargs["kind"] == "thumb":
Expand Down
4 changes: 3 additions & 1 deletion src/paperless_tika/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@

class TikaDocumentParser(DocumentParser):
"""
This parser uses Apache Tika to try and get some text out of
This parser uses Apache-Tika to try and get some text out of office
formats, whether it's an open-office (ODS, ODT, ODP), or ms-office
format (XLS, XLSX, DOC, DOCX, PPT, PPTX)
"""

CONVERT = settings.CONVERT_BINARY
Expand Down
2 changes: 1 addition & 1 deletion src/paperless_tika/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class ConsumerDeclaration:

MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp)$")
MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp|xlsx?|docx?|pptx?)$")

@classmethod
def handle(cls, sender, **kwargs):
Expand Down
6 changes: 3 additions & 3 deletions src/paperless_tika/tests/test_signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ def test_test_handles_various_file_names_true(self):
"A document with a . in it", "Doc with -- in it"
)
suffixes = (
"ods", "odt", "odp",
"ODS", "ODT", "ODP",
"oDs", "oDt", "oDp"
"ods", "odt", "odp", "xls", "xlsx", "doc", "docx", "ppt", "pptx",
"ODS", "ODT", "ODP", "XLS", "XLSX", "DOC", "DOCX", "PPT", "PPTX",
"oDs", "oDt", "oDp", "xLs", "xLsX", "dOc", "dOcX", "pPt", "pPtX",
)

for prefix in prefixes:
Expand Down

0 comments on commit 06c73d4

Please sign in to comment.