Add an option to upgrade to zip64 for long zips #42

Closed · wants to merge 2 commits
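
In brief: stream_zip() gains an opt-in allow_upgrade_to_64 keyword argument. When set, file entries, the central directory, and the end-of-central-directory record are upgraded to zip64 automatically once the running archive offset no longer fits in 32 bits, rather than raising OffsetOverflowError. A minimal usage sketch, assuming this repo's stream_zip module as the import surface (the name, timestamp, and contents below are illustrative):

    from datetime import datetime
    from stream_zip import stream_zip, NO_COMPRESSION_32

    # Each file is (name, modified_at, permissions, method, iterable-of-chunks).
    files = (
        ('file-1', datetime.now(), 0o600, NO_COMPRESSION_32, (b'contents',)),
    )

    # With allow_upgrade_to_64=True, crossing the 0xffffffff offset upgrades the
    # affected records to zip64 instead of raising OffsetOverflowError.
    for chunk in stream_zip(files, allow_upgrade_to_64=True):
        pass  # write chunk to a file, an S3 upload, an HTTP response, ...
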
53 changes: 37 additions & 16 deletions stream_zip.py
@@ -2,12 +2,17 @@
 from struct import Struct
 import zlib
 
-NO_COMPRESSION_32 = object()
-NO_COMPRESSION_64 = object()
-ZIP_32 = object()
-ZIP_64 = object()
+ZIP_32 = 0b00
+ZIP_64 = 0b01
+NO_COMPRESSION = 0b10
+NO_COMPRESSION_32 = NO_COMPRESSION | ZIP_32
+NO_COMPRESSION_64 = NO_COMPRESSION | ZIP_64
 
-def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9)):
+_MAX_ZIP32_FILE_OFFSET = 0xffffffff
+_MAX_ZIP64_FILE_OFFSET = 0xffffffffffffffff
+
+
+def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9), allow_upgrade_to_64=False):
 
     def evenly_sized(chunks):
         chunk = b''
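
The methods are now small integers that compose as bit flags, which is what lets a file be upgraded in place with method |= ZIP_64 later in the diff. A quick sanity check of the scheme, runnable on its own with the constants copied from above:

    ZIP_32 = 0b00
    ZIP_64 = 0b01
    NO_COMPRESSION = 0b10
    NO_COMPRESSION_32 = NO_COMPRESSION | ZIP_32  # 0b10
    NO_COMPRESSION_64 = NO_COMPRESSION | ZIP_64  # 0b11

    # OR-ing ZIP_64 into either 32-bit method lands on its 64-bit counterpart:
    assert ZIP_32 | ZIP_64 == ZIP_64
    assert NO_COMPRESSION_32 | ZIP_64 == NO_COMPRESSION_64

    # And `method & ZIP_64` is truthy exactly for the 64-bit variants, which is
    # how the diff derives zip_64_central_directory below:
    assert not (ZIP_32 & ZIP_64)
    assert NO_COMPRESSION_64 & ZIP_64
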
@@ -140,7 +145,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
     def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
         file_offset = offset
 
-        _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError)
+        _raise_if_beyond(file_offset, maximum=_MAX_ZIP32_FILE_OFFSET, exception_class=OffsetOverflowError)
 
         yield from _(local_header_signature)
         yield from _(local_header_struct.pack(
@@ -216,9 +221,9 @@ def _zip_data(chunks, max_uncompressed_size, max_compressed_size):
     def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
         file_offset = offset
 
-        _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError)
+        _raise_if_beyond(file_offset, maximum=_MAX_ZIP64_FILE_OFFSET, exception_class=OffsetOverflowError)
 
-        chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffffffffffff)
+        chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=_MAX_ZIP64_FILE_OFFSET)
 
         extra = zip_64_local_extra_struct.pack(
             zip_64_extra_signature,
@@ -275,7 +280,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
     def _no_compression_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
         file_offset = offset
 
-        _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError)
+        _raise_if_beyond(file_offset, maximum=_MAX_ZIP32_FILE_OFFSET, exception_class=OffsetOverflowError)
 
         chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffff)
 
@@ -338,8 +343,6 @@ def _chunks():
         return chunks, size, crc_32
 
     for name, modified_at, perms, method, chunks in files:
-        zip_64_central_directory = zip_64_central_directory or method in (ZIP_64, NO_COMPRESSION_64)
-
         name_encoded = name.encode('utf-8')
         _raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)
 
@@ -355,20 +358,32 @@ def _chunks():
             (perms << 16) | \
             (0x10 if name_encoded[-1:] == b'/' else 0x0)  # MS-DOS directory
 
+        if allow_upgrade_to_64 and offset > _MAX_ZIP32_FILE_OFFSET:
+            # Force zip64 for this file - we can't write files with this offset in zip32
+            method |= ZIP_64
+
+
+        zip_64_central_directory = zip_64_central_directory or method & ZIP_64
+
         data_func = \
             _zip_64_local_header_and_data if method is ZIP_64 else \
             _zip_32_local_header_and_data if method is ZIP_32 else \
             _no_compression_64_local_header_and_data if method is NO_COMPRESSION_64 else \
             _no_compression_32_local_header_and_data
         central_directory.append((yield from data_func(name_encoded, mod_at_encoded, external_attr, evenly_sized(chunks))))
 
+    central_directory_start_offset = offset
+    if allow_upgrade_to_64 and central_directory_start_offset > _MAX_ZIP32_FILE_OFFSET:
+        # Even if all files written are zip32, we need to write a zip64 central directory if the total offset is >4GiB
+        zip_64_central_directory = True
+
     max_central_directory_length, max_central_directory_start_offset, max_central_directory_size = \
-        (0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff) if zip_64_central_directory else \
-        (0xffff, 0xffffffff, 0xffffffff)
+        (_MAX_ZIP64_FILE_OFFSET, _MAX_ZIP64_FILE_OFFSET, _MAX_ZIP64_FILE_OFFSET) if zip_64_central_directory else \
+        (0xffff, _MAX_ZIP32_FILE_OFFSET, _MAX_ZIP32_FILE_OFFSET)
 
-    central_directory_start_offset = offset
     central_directory_end_offset = offset
     central_directory_size = central_directory_end_offset - central_directory_start_offset
 
     _raise_if_beyond(central_directory_start_offset, maximum=max_central_directory_start_offset, exception_class=OffsetOverflowError)
     _raise_if_beyond(len(central_directory), maximum=max_central_directory_length, exception_class=CentralDirectoryNumberOfEntriesOverflowError)
 
@@ -381,8 +396,14 @@ def _chunks():
     central_directory_end_offset = offset
     central_directory_size = central_directory_end_offset - central_directory_start_offset
 
-    _raise_if_beyond(central_directory_end_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError)
-    _raise_if_beyond(central_directory_size, maximum=max_central_directory_size, exception_class=CentralDirectorySizeOverflowError)
+    zip_64_end_of_central_directory = zip_64_central_directory
+    if allow_upgrade_to_64 and central_directory_end_offset > _MAX_ZIP32_FILE_OFFSET:
+        # Even if all the files written are zip32 and the central directory is zip32, we need to write a zip64 EOCD record if the total offset now is >4GiB
+        zip_64_end_of_central_directory = True
+
+    max_central_directory_end_offset = _MAX_ZIP64_FILE_OFFSET if zip_64_end_of_central_directory else _MAX_ZIP32_FILE_OFFSET
+    _raise_if_beyond(central_directory_size, maximum=max_central_directory_size, exception_class=CentralDirectorySizeOverflowError)
+    _raise_if_beyond(central_directory_end_offset, maximum=max_central_directory_end_offset, exception_class=OffsetOverflowError)
 
     if zip_64_central_directory:
         yield from _(zip_64_end_of_central_directory_signature)
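
Taken together, the change guards three separate zip32 limits, all gated on allow_upgrade_to_64. A plain-Python summary of when each upgrade fires, using the PR's own constant:

    ZIP32_LIMIT = 0xffffffff           # largest value a 4-byte zip field can hold
    assert ZIP32_LIMIT == 2 ** 32 - 1  # the ">4GiB" in the comments above

    # 1. Per file: a local header that would start past ZIP32_LIMIT promotes the
    #    file's method with `method |= ZIP_64`.
    # 2. Central directory: a central directory *starting* past ZIP32_LIMIT
    #    forces zip_64_central_directory = True.
    # 3. End of central directory: even a zip32 central directory can *end*
    #    past ZIP32_LIMIT, in which case only the EOCD record is written as zip64.
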
120 changes: 102 additions & 18 deletions test_stream_zip.py
@@ -1,6 +1,7 @@
 from datetime import datetime
 from io import BytesIO
 import contextlib
+import itertools
 import os
 import stat
 import subprocess
@@ -35,6 +36,28 @@ def cwd(new_dir):
         os.chdir(old_dir)
 
 
+
+def consume_stream(streamed_zip):
+    """
+    Helper - consumes generators in a streamed zip.
+    Returns the total length of the zip file.
+    """
+    return sum((len(chunk) for chunk in streamed_zip), 0)
+
+
+def consume_stream_unzip(streamed_zip) -> int:
+    """
+    Helper - consumes generators in a streamed zip, passing through stream_unzip to
+    ensure the stream is valid.
+    Returns the total length of the unzipped content.
+    """
+    num_received = 0
+    for name, size, chunks in stream_unzip(streamed_zip):
+        for chunk in chunks:
+            num_received += len(chunk)
+    return num_received
+
+
 def test_with_stream_unzip_zip_64():
     now = datetime.fromisoformat('2021-01-01 21:01:12')
     perms = 0o600
@@ -103,14 +126,60 @@ def data():

         yield 'file-1', now, perms, ZIP_64, data()
 
-    num_received = 0
-    for name, size, chunks in stream_unzip(stream_zip(files())):
-        for chunk in chunks:
-            num_received += len(chunk)
-
+    num_received = consume_stream_unzip(stream_zip(files()))
     assert num_received == 5000000000
 
 
+def test_zip64_central_directory_after_only_zip32_files():
+    now = datetime.fromisoformat("2021-01-01 21:01:12")
+    perms = 0o600
+
+    def files():
+        for i in range(4):
+            # 1.1GB of data per file
+            data = itertools.repeat(b"0" * 1000000, 1100)
+            yield f"file-{i}", now, perms, NO_COMPRESSION_32, data
+
+        # now the file offset is >4GiB, but so far no overflow has been raised.
+        return
+
+    # Using allow_upgrade_to_64=False (the default) raises an overflow error *after* writing all the
+    # files, because the zip32 central directory can't be written.
+    with pytest.raises(OffsetOverflowError):
+        consume_stream(stream_zip(files()))
+
+    # Doing the same with it set to True just generates a zip64 central directory instead.
+    consume_stream_unzip(stream_zip(files(), allow_upgrade_to_64=True))
+
+
+def test_zip64_end_of_central_directory_after_zip32_central_directory():
+    now = datetime.fromisoformat("2021-01-01 21:01:12")
+    perms = 0o600
+
+    def files():
+        def data():
+            somebytes = b"0" * 429496
+            yield from itertools.repeat(somebytes, 10000)
+            # there are 7295 bytes remaining until we hit 4GiB; use up almost all of them
+            yield b"0" * 7290
+
+        yield "file-1", now, perms, NO_COMPRESSION_32, data()
+
+        # Here the file offset is still JUST under 4GiB, but writing the central directory will put it
+        # over the limit. To be clear, writing the zip32 central directory *works fine* - but the *end*
+        # of central directory record cannot be written, since it must contain a 32-bit pointer to an
+        # offset over 4GiB.
+        return
+
+    # Using allow_upgrade_to_64=False (the default) raises an overflow error *after* writing all the
+    # files, because the zip32 end of central directory record can't be written.
+    with pytest.raises(OffsetOverflowError):
+        consume_stream(stream_zip(files()))
+
+    # Doing the same with it set to True just generates a zip64 end of central directory record instead.
+    consume_stream_unzip(stream_zip(files(), allow_upgrade_to_64=True))
+
+
 def test_with_stream_unzip_large_not_easily_compressible_with_no_compression_64():
     now = datetime.fromisoformat('2021-01-01 21:01:12')
     perms = 0o600
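
As a cross-check of the sizes chosen in test_zip64_end_of_central_directory_after_zip32_central_directory above, the data is kept just under the zip32 offset limit so that only the end-of-central-directory record overflows:

    # 10000 chunks of 429496 bytes, verified against the 0xffffffff limit:
    assert 429496 * 10000 == 4_294_960_000
    assert 0xffffffff - 4_294_960_000 == 7295  # the "7295 bytes remaining"
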
@@ -124,11 +193,7 @@ def data():
         yield 'file-1', now, perms, ZIP_64, data()
         yield 'file-2', now, perms, NO_COMPRESSION_64, (b'-',)
 
-    num_received = 0
-    for name, size, chunks in stream_unzip(stream_zip(files())):
-        for chunk in chunks:
-            num_received += len(chunk)
-
+    num_received = consume_stream_unzip(stream_zip(files()))
     assert num_received == 5000000001
 
 
@@ -146,9 +211,31 @@ def data():
         yield 'file-2', now, perms, NO_COMPRESSION_32, (b'-',)
 
     with pytest.raises(OffsetOverflowError):
-        for name, size, chunks in stream_unzip(stream_zip(files())):
-            for chunk in chunks:
-                pass
+        consume_stream(stream_zip(files()))
+
+
+def test_zip32_file_after_long_offset_upgraded_to_zip64():
+    now = datetime.fromisoformat('2021-01-01 21:01:12')
+    perms = 0o600
+    batch = b'0' * 500000
+
+    def files():
+        def data():
+            for i in range(0, 10000):
+                yield batch
+
+        # write 5GB of data
+        yield 'file-1', now, perms, NO_COMPRESSION_64, data()
+        # now the file offset is >4GiB, so writing the next file as ZIP32 would fail.
+        yield 'file-2', now, perms, NO_COMPRESSION_32, (b'-',)
+
+    # Using allow_upgrade_to_64=True automatically upgrades the file to be written as ZIP64
+    # due to the file offset being too large.
+    consume_stream_unzip(stream_zip(files(), allow_upgrade_to_64=True))
+
+    # Doing the same with it set to False (the default) raises an overflow error instead:
+    with pytest.raises(OffsetOverflowError):
+        consume_stream(stream_zip(files(), allow_upgrade_to_64=False))
 
 
 def test_with_stream_unzip_large_not_easily_compressible_with_zip_32():
@@ -165,9 +252,7 @@ def data():
         yield 'file-2', now, perms, ZIP_32, (b'-',)  # Needs a ZIP_64 offset, but is in ZIP_32 mode
 
     with pytest.raises(OffsetOverflowError):
-        for name, size, chunks in stream_unzip(stream_zip(files())):
-            for chunk in chunks:
-                pass
+        consume_stream(stream_zip(files()))
 
 
 def test_zip_overflow_large_not_easily_compressible():
@@ -183,8 +268,7 @@ def data():
         yield 'file-1', now, perms, ZIP_32, data()
 
     with pytest.raises(CompressedSizeOverflowError):
-        for chunk in stream_zip(files()):
-            pass
+        consume_stream(stream_zip(files()))
 
 
 def test_zip_overflow_large_easily_compressible():
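
A possible further assertion for these tests, sketched here rather than part of the PR: an upgraded archive can be distinguished from a plain zip32 one by the zip64 end-of-central-directory signature, the 4 bytes 0x06064b50 stored little-endian as PK\x06\x06. The helper name and buffer size are illustrative:

    def ends_with_zip64_eocd(streamed_zip):
        # Keep only the tail of the stream; the zip64 EOCD record (if present)
        # sits near the end, followed by the zip64 EOCD locator and the
        # ordinary zip32 EOCD record.
        tail = b''
        for chunk in streamed_zip:
            tail = (tail + chunk)[-1024:]
        return b'PK\x06\x06' in tail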