Skip to content

Commit

Permalink
feat: add ZIP_AUTO mode
Browse files Browse the repository at this point in the history
This allows clients to not worry about which ZIP_32 or ZIP_64 mode they need,
as long as they have the uncompressed size of the file.

Most discussion is in #42
  • Loading branch information
michalc committed Jun 3, 2023
1 parent 17a4f36 commit 0f292c9
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 15 deletions.
23 changes: 15 additions & 8 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ This is an example of a ZIP showing all supported compression and metadata format

```python
from datetime import datetime
from stream_zip import ZIP_64, ZIP_32, NO_COMPRESSION_64, NO_COMPRESSION_32, stream_zip
from stream_zip import ZIP_AUTO, ZIP_64, ZIP_32, NO_COMPRESSION_64, NO_COMPRESSION_32, stream_zip

def member_files():
modified_at = datetime.now()
Expand All @@ -111,30 +111,37 @@ def member_files():
yield b'Some bytes 1'

def file_2_data():
yield b'Some bytes 1'
yield b'Some bytes 2'

def file_3_data():
yield b'Some bytes 1'
yield b'Some bytes 2'
yield b'Some bytes 3'
yield b'Some bytes 4'

def file_4_data():
yield b'Some bytes 4'
yield b'Some bytes 5'
yield b'Some bytes 6'
yield b'Some bytes 7'

def file_5_data():
for i in range(5):
yield bytes(f'Some bytes {i}', encoding="utf-8")

# ZIP_64 mode
yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_1_data()
yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_2_data()

# ZIP_32 mode
yield 'my-file-2.txt', modified_at, perms, ZIP_32, file_2_data()
yield 'my-file-2.txt', modified_at, perms, ZIP_32, file_3_data()

# ZIP_AUTO to choose between ZIP_32 and ZIP_64 automatically based on
# the uncompressed size of data
yield 'my-file-3.txt', modified_at, perms, ZIP_AUTO(uncompressed_size=12), file_1_data()

# No compression for ZIP_64 files
yield 'my-file-3.txt', modified_at, perms, NO_COMPRESSION_64, file_3_data()
yield 'my-file-4.txt', modified_at, perms, NO_COMPRESSION_64, file_4_data()

# No compression for ZIP_32 files
yield 'my-file-4.txt', modified_at, perms, NO_COMPRESSION_32, file_4_data()
yield 'my-file-5.txt', modified_at, perms, NO_COMPRESSION_32, file_5_data()

for zipped_chunk in stream_zip(member_files()):
print(zipped_chunk)
Expand Down
39 changes: 33 additions & 6 deletions stream_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,42 @@
# Sentinel objects identifying which ZIP structure "method" a member uses.
_ZIP_32 = object()
_ZIP_64 = object()

# Sentinels controlling whether the central directory may be automatically
# upgraded to ZIP 64 when a member's offset exceeds the 32-bit limit
# (0xffffffff). ZIP_AUTO opts in; the explicit modes opt out.
_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()
_NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()

def NO_COMPRESSION_32(offset, default_get_compressobj):
    """Mode function: store the member uncompressed with ZIP 32-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off: the central directory is never upgraded to
    ZIP 64 for this mode based on offset alone.
    """
    # NOTE(review): the capture contained a stale pre-change two-tuple return
    # above this one (dead code from the diff); only the triple form is live.
    return _NO_COMPRESSION_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def NO_COMPRESSION_64(offset, default_get_compressobj):
    """Mode function: store the member uncompressed with ZIP 64-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off (ZIP 64 is already forced by the method itself).
    """
    # NOTE(review): removed the stale pre-change two-tuple return left in the
    # diff capture; the triple form is the live one.
    return _NO_COMPRESSION_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def ZIP_32(offset, default_get_compressobj):
    """Mode function: deflate-compress the member with ZIP 32-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off: callers choosing ZIP_32 explicitly get no
    automatic ZIP 64 central directory upgrade.
    """
    # NOTE(review): removed the stale pre-change two-tuple return left in the
    # diff capture; the triple form is the live one.
    return _ZIP_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def ZIP_64(offset, default_get_compressobj):
    """Mode function: deflate-compress the member with ZIP 64-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off (ZIP 64 is already forced by the method itself).
    """
    # NOTE(review): removed the stale pre-change two-tuple return left in the
    # diff capture; the triple form is the live one.
    return _ZIP_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def ZIP_AUTO(uncompressed_size, level=9):
    """Return a mode function that picks ZIP_32 or ZIP_64 automatically.

    The choice is made per member from its known uncompressed size and its
    offset in the archive: ZIP 64 structures are used when either the
    worst-case compressed size or the offset could overflow a 32-bit field,
    ZIP 32 structures otherwise. The returned callable has the same shape as
    ZIP_32/ZIP_64 and opts in to automatic central directory upgrade.
    """
    # Largest uncompressed size whose worst-case deflate output still fits a
    # 32-bit field. Worked out from zlib's bound
    # https://github.com/madler/zlib/blob/04f42ceca40f73e2978b50e93806c2a18c1281fc/deflate.c#L696
    # by assuming compressed size cannot exceed
    #
    #     uncompressed_size + (u >> 12) + (u >> 14) + (u >> 25) + 7
    #
    # That is the 0.03% deflate bound for the default memLevel of 8 (and
    # default wbits); a higher memLevel is assumed never to exceed it — it
    # should in fact compress smaller, which experiments on random data back
    # up. Python's interaction with zlib also varies between versions
    # (https://stackoverflow.com/q/76371334/1319998) and could emit extra
    # deflate chunks, but forcing memLevel=9 below (smaller output) gives a
    # margin of safety. A lower limit probably exists, but would need
    # certainty about zlib at memLevel 9 and Python's interaction with it.
    zip_32_max_uncompressed = 4293656841

    def auto_mode(offset, default_get_compressobj):
        needs_zip_64 = uncompressed_size > zip_32_max_uncompressed or offset > 0xffffffff
        method = _ZIP_64 if needs_zip_64 else _ZIP_32
        get_compressobj = lambda: zlib.compressobj(level=level, memLevel=9, wbits=-zlib.MAX_WBITS)
        return method, _AUTO_UPGRADE_CENTRAL_DIRECTORY, get_compressobj

    return auto_mode


def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9)):
Expand Down Expand Up @@ -353,7 +378,7 @@ def _chunks():
return chunks, size, crc_32

for name, modified_at, perms, method, chunks in files:
_method, _get_compress_obj = method(offset, get_compressobj)
_method, _auto_upgrade_central_directory, _get_compress_obj = method(offset, get_compressobj)

name_encoded = name.encode('utf-8')
_raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)
Expand All @@ -377,7 +402,9 @@ def _chunks():
_no_compression_32_local_header_and_data
central_directory.append((yield from data_func(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, evenly_sized(chunks))))

zip_64_central_directory = zip_64_central_directory or _method in (_ZIP_64, _NO_COMPRESSION_64)
zip_64_central_directory = zip_64_central_directory \
or (_auto_upgrade_central_directory is _AUTO_UPGRADE_CENTRAL_DIRECTORY and offset > 0xffffffff) \
or _method in (_ZIP_64, _NO_COMPRESSION_64)

max_central_directory_length, max_central_directory_start_offset, max_central_directory_size = \
(0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff) if zip_64_central_directory else \
Expand Down
100 changes: 99 additions & 1 deletion test_stream_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
from zipfile import ZipFile

import pytest
from stream_unzip import stream_unzip
from stream_unzip import UnsupportedZip64Error, stream_unzip

from stream_zip import (
stream_zip,
NO_COMPRESSION_64,
NO_COMPRESSION_32,
ZIP_AUTO,
ZIP_64,
ZIP_32,
CompressedSizeOverflowError,
Expand All @@ -35,6 +36,14 @@ def cwd(new_dir):
os.chdir(old_dir)


def gen_bytes(num):
    """Yield exactly *num* filler bytes (b'-') in chunks of at most 100,000."""
    filler = b'-' * 100000
    remaining = num
    while remaining:
        step = min(len(filler), remaining)
        yield filler[:step]
        remaining -= step


def test_with_stream_unzip_zip_64():
now = datetime.fromisoformat('2021-01-01 21:01:12')
perms = 0o600
Expand Down Expand Up @@ -91,6 +100,95 @@ def files():
]


def test_with_stream_unzip_auto_small():
    # ZIP_AUTO on small members must emit ZIP_32 records, so the archive
    # round-trips even when the unzipper forbids ZIP 64 structures.
    modified_at = datetime.fromisoformat('2021-01-01 21:01:12')
    mode = 0o600

    def member_files():
        yield 'file-1', modified_at, mode, ZIP_AUTO(20000), (b'a' * 10000, b'b' * 10000)
        yield 'file-2', modified_at, mode, ZIP_AUTO(2), (b'c', b'd')

    unzipped = [
        (name, size, b''.join(chunks))
        for name, size, chunks in stream_unzip(stream_zip(member_files()), allow_zip64=False)
    ]
    assert unzipped == [
        (b'file-1', None, b'a' * 10000 + b'b' * 10000),
        (b'file-2', None, b'cd'),
    ]


@pytest.mark.parametrize(
    "level",
    [
        0,
        9,
    ],
)
def test_with_stream_unzip_at_zip_32_limit(level):
    # 4293656841 is the largest uncompressed size ZIP_AUTO keeps as ZIP_32,
    # so unzipping with allow_zip64=False must succeed at exactly this size.
    modified_at = datetime.fromisoformat('2021-01-01 21:01:12')
    mode = 0o600

    def member_files():
        yield 'file-1', modified_at, mode, ZIP_AUTO(4293656841, level=level), gen_bytes(4293656841)

    results = [
        (name, size, sum(len(chunk) for chunk in chunks))
        for name, size, chunks in stream_unzip(stream_zip(member_files()), allow_zip64=False)
    ]
    assert results == [(b'file-1', None, 4293656841)]


@pytest.mark.parametrize(
    "level",
    [
        0,
        9,
    ],
)
def test_with_stream_unzip_above_zip_32_size_limit(level):
    # One byte past the ZIP_32 threshold: ZIP_AUTO must switch to ZIP_64,
    # which unzips normally but is rejected when ZIP 64 is disallowed.
    modified_at = datetime.fromisoformat('2021-01-01 21:01:12')
    mode = 0o600

    def member_files():
        yield 'file-1', modified_at, mode, ZIP_AUTO(4293656842, level=level), gen_bytes(4293656842)

    results = [
        (name, size, sum(len(chunk) for chunk in chunks))
        for name, size, chunks in stream_unzip(stream_zip(member_files()))
    ]
    assert results == [(b'file-1', None, 4293656842)]

    with pytest.raises(UnsupportedZip64Error):
        next(iter(stream_unzip(stream_zip(member_files()), allow_zip64=False)))


def test_with_stream_unzip_above_zip_32_offset_limit():
    """ZIP_AUTO members that are individually small enough for ZIP_32 must
    still switch to ZIP_64 once their offset in the archive passes 0xffffffff.

    file-1 and file-2 (4 GB each, stored at level=0) are ZIP_32, but file-3
    starts beyond the 32-bit offset limit so it must be written as ZIP_64.
    """
    now = datetime.fromisoformat('2021-01-01 21:01:12')
    perms = 0o600

    def files():
        yield 'file-1', now, perms, ZIP_AUTO(4000000000, level=0), gen_bytes(4000000000)
        yield 'file-2', now, perms, ZIP_AUTO(4000000000, level=0), gen_bytes(4000000000)
        yield 'file-3', now, perms, ZIP_AUTO(1, level=0), gen_bytes(1)

    # With ZIP 64 allowed, the whole archive round-trips.
    assert [(b'file-1', None, 4000000000), (b'file-2', None, 4000000000), (b'file-3', None, 1)] == [
        (name, size, sum(len(chunk) for chunk in chunks))
        for name, size, chunks in stream_unzip(stream_zip(files()))
    ]

    # With ZIP 64 disallowed, the first two members must still unzip (proving
    # they are ZIP_32); only reaching file-3 may raise. The flags record how
    # far we got before the expected failure.
    file_1_zip_32 = False
    file_2_zip_32 = False
    with pytest.raises(UnsupportedZip64Error):
        it = iter(stream_unzip(stream_zip(files()), allow_zip64=False))
        name, size, chunks = next(it)
        # Each member's chunks must be fully consumed before advancing.
        for c in chunks:
            pass
        file_1_zip_32 = True
        name, size, chunks = next(it)
        for c in chunks:
            pass
        file_2_zip_32 = True
        name, size, chunks = next(it)

    assert file_1_zip_32
    assert file_2_zip_32


def test_with_stream_unzip_large_easily_compressible():
now = datetime.fromisoformat('2021-01-01 21:01:12')
perms = 0o600
Expand Down

0 comments on commit 0f292c9

Please sign in to comment.