Skip to content
Permalink
Browse files Browse the repository at this point in the history
Don't load images / documents into memory when calculating their hash
This can be done in chunks, which prevents crashes when uploading large files.
  • Loading branch information
RealOrangeOne authored and gasman committed Apr 3, 2023
1 parent eefc338 commit 3c0c646
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 13 deletions.
13 changes: 6 additions & 7 deletions wagtail/documents/models.py
@@ -1,4 +1,3 @@
import hashlib
import os.path
import urllib
from contextlib import contextmanager
Expand All @@ -15,6 +14,7 @@
from wagtail.models import CollectionMember, ReferenceIndex
from wagtail.search import index
from wagtail.search.queryset import SearchableQuerySetMixin
from wagtail.utils.file import hash_filelike


class DocumentQuerySet(SearchableQuerySetMixin, models.QuerySet):
Expand Down Expand Up @@ -122,14 +122,13 @@ def get_file_size(self):

return self.file_size

def _set_file_hash(self, file_contents):
self.file_hash = hashlib.sha1(file_contents).hexdigest()
def _set_file_hash(self):
    # Stream the document through hash_filelike in chunks so large
    # files never have to be read into memory in full.
    with self.open_file() as document_file:
        self.file_hash = hash_filelike(document_file)

def get_file_hash(self):
    # Lazily compute the hash the first time it is requested, then
    # persist it so subsequent calls hit the cached column.
    if self.file_hash == "":
        self._set_file_hash()
        self.save(update_fields=["file_hash"])

    return self.file_hash
Expand All @@ -141,7 +140,7 @@ def _set_document_file_metadata(self):
self.file_size = self.file.size

# Set new document file hash
self._set_file_hash(self.file.read())
self._set_file_hash()
self.file.seek(0)

def __str__(self):
Expand Down
13 changes: 13 additions & 0 deletions wagtail/documents/tests/test_models.py
Expand Up @@ -126,6 +126,19 @@ def test_content_type(self):
"application/octet-stream", self.extensionless_document.content_type
)

def test_file_hash(self):
    # All three fixture documents share the same file contents,
    # so they must hash to the same digest.
    expected_hash = "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85"
    for document in (
        self.document,
        self.pdf_document,
        self.extensionless_document,
    ):
        self.assertEqual(document.get_file_hash(), expected_hash)

def test_content_disposition(self):
self.assertEqual(
"""attachment; filename=example.doc; filename*=UTF-8''example.doc""",
Expand Down
12 changes: 6 additions & 6 deletions wagtail/images/models.py
Expand Up @@ -38,6 +38,7 @@
from wagtail.models import CollectionMember, ReferenceIndex
from wagtail.search import index
from wagtail.search.queryset import SearchableQuerySetMixin
from wagtail.utils.file import hash_filelike

logger = logging.getLogger("wagtail.images")

Expand Down Expand Up @@ -267,14 +268,13 @@ class AbstractImage(ImageFileMixin, CollectionMember, index.Indexed, models.Mode

objects = ImageQuerySet.as_manager()

def _set_file_hash(self, file_contents):
self.file_hash = hashlib.sha1(file_contents).hexdigest()
def _set_file_hash(self):
    # Stream the image through hash_filelike in chunks so large
    # files never have to be read into memory in full.
    with self.open_file() as image_file:
        self.file_hash = hash_filelike(image_file)

def get_file_hash(self):
    # Lazily compute the hash the first time it is requested, then
    # persist it so subsequent calls hit the cached column.
    if self.file_hash == "":
        self._set_file_hash()
        self.save(update_fields=["file_hash"])

    return self.file_hash
Expand All @@ -286,7 +286,7 @@ def _set_image_file_metadata(self):
self.file_size = self.file.size

# Set new image file hash
self._set_file_hash(self.file.read())
self._set_file_hash()
self.file.seek(0)

def get_upload_to(self, filename):
Expand Down
5 changes: 5 additions & 0 deletions wagtail/images/tests/test_models.py
Expand Up @@ -112,6 +112,11 @@ def test_get_file_size_on_missing_file_raises_sourceimageioerror(self):
with self.assertRaises(SourceImageIOError):
self.image.get_file_size()

def test_file_hash(self):
    # Expected SHA-1 of the test image fixture's file contents.
    self.assertEqual(
        self.image.get_file_hash(), "4dd0211870e130b7e1690d2ec53c499a54a48fef"
    )


class TestImageQuerySet(TestCase):
def test_search_method(self):
Expand Down
59 changes: 59 additions & 0 deletions wagtail/tests/test_utils.py
@@ -1,8 +1,11 @@
# -*- coding: utf-8 -*-
import pickle
from io import BytesIO, StringIO
from pathlib import Path

from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import ImproperlyConfigured, SuspiciousOperation
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import SimpleTestCase, TestCase, override_settings
from django.utils.text import slugify
from django.utils.translation import _trans
Expand All @@ -23,6 +26,7 @@
string_to_ascii,
)
from wagtail.models import Page, Site
from wagtail.utils.file import hash_filelike
from wagtail.utils.utils import deep_update


Expand Down Expand Up @@ -508,3 +512,58 @@ def test_deep_update(self):
"starship": "enterprise",
},
)


class HashFileLikeTestCase(SimpleTestCase):
    """Tests for ``hash_filelike`` across the kinds of file-like objects it must accept."""

    # NOTE(review): resolved against the process CWD - assumes the test run
    # starts from the repository root, where LICENSE lives. Confirm for CI.
    test_file = Path.cwd() / "LICENSE"

    def test_hashes_io(self):
        # Binary and text in-memory streams of the same content hash identically.
        self.assertEqual(
            hash_filelike(BytesIO(b"test")), "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        )
        self.assertEqual(
            hash_filelike(StringIO("test")), "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
        )

    def test_hashes_file(self):
        # Text mode: read() yields `str` chunks, which must be encoded before hashing.
        with self.test_file.open(mode="r") as f:
            self.assertEqual(
                hash_filelike(f), "9e58400061ca660ef7b5c94338a5205627c77eda"
            )

    def test_hashes_file_bytes(self):
        # Binary mode must produce the same digest as text mode.
        with self.test_file.open(mode="rb") as f:
            self.assertEqual(
                hash_filelike(f), "9e58400061ca660ef7b5c94338a5205627c77eda"
            )

    def test_hashes_django_uploaded_file(self):
        """
        Check Django's file shims can be hashed as-is.
        `SimpleUploadedFile` inherits the base `UploadedFile`, but is easiest to test against
        """
        self.assertEqual(
            hash_filelike(SimpleUploadedFile("example.txt", b"test")),
            "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3",
        )

    def test_hashes_large_file(self):
        class FakeLargeFile:
            """
            A class that pretends to be a huge file (~1.3GB)
            """

            def __init__(self):
                # 20000 reads of 64KiB chunks ~= 1.3GB of fake data.
                self.iterations = 20000

            # Parameter renamed from `bytes` to avoid shadowing the builtin.
            def read(self, num_bytes):
                self.iterations -= 1
                if not self.iterations:
                    return b""

                return b"A" * num_bytes

            # No `seek`/`tell` defined: also exercises non-seekable input.

        self.assertEqual(
            hash_filelike(FakeLargeFile()),
            "187cc1db32624dccace20d042f6d631f1a483020",
        )
35 changes: 35 additions & 0 deletions wagtail/utils/file.py
@@ -0,0 +1,35 @@
from hashlib import sha1
from io import UnsupportedOperation

from django.utils.encoding import force_bytes

HASH_READ_SIZE = 65536  # 64k


def hash_filelike(filelike):
    """
    Compute the SHA-1 hash of a file-like object, without loading it all into memory.

    The file is consumed in ``HASH_READ_SIZE`` chunks. If the object is
    seekable, hashing starts from the beginning of the file and the
    handle's position is restored afterwards; non-seekable objects are
    hashed from their current position.
    """
    # Remember where the caller left the handle so we can restore it afterwards.
    file_pos = 0
    if hasattr(filelike, "tell"):
        file_pos = filelike.tell()

    try:
        # Reset file handler to the start of the file so we hash it all
        filelike.seek(0)
    except (AttributeError, UnsupportedOperation):
        pass

    hasher = sha1()
    while True:
        data = filelike.read(HASH_READ_SIZE)
        if not data:
            break
        # Files opened in text mode yield `str` chunks - encode them so the
        # hash is always computed over bytes (UTF-8, matching
        # `force_bytes`' default handling of `str`, without needing Django).
        hasher.update(data.encode("utf-8") if isinstance(data, str) else data)

    try:
        # Reset the file handler to where it was before. Guarded like the
        # seek above: some objects expose `seek` but raise
        # UnsupportedOperation when it is called (non-seekable streams),
        # which previously crashed the hash after all the work was done.
        filelike.seek(file_pos)
    except (AttributeError, UnsupportedOperation):
        pass

    return hasher.hexdigest()

0 comments on commit 3c0c646

Please sign in to comment.