Add an option to upgrade to zip64 for long zips #42

Closed · wants to merge 2 commits
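
In brief: stream_zip() gains an opt-in allow_upgrade_to_64 keyword argument. When set, file entries, the central directory, and the end-of-central-directory record are upgraded to zip64 automatically once the running archive offset no longer fits in 32 bits, rather than raising OffsetOverflowError. A minimal usage sketch, assuming this repo's stream_zip module as the import surface (the name, timestamp, and contents below are illustrative):

    from datetime import datetime
    from stream_zip import stream_zip, NO_COMPRESSION_32

    # Each file is (name, modified_at, permissions, method, iterable-of-chunks).
    files = (
        ('file-1', datetime.now(), 0o600, NO_COMPRESSION_32, (b'contents',)),
    )

    # With allow_upgrade_to_64=True, crossing the 0xffffffff offset upgrades the
    # affected records to zip64 instead of raising OffsetOverflowError.
    for chunk in stream_zip(files, allow_upgrade_to_64=True):
        pass  # write chunk to a file, an S3 upload, an HTTP response, ...
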
53 changes: 37 additions & 16 deletions stream_zip.py
@@ -2,12 +2,17 @@
 from struct import Struct
 import zlib
 
-NO_COMPRESSION_32 = object()
-NO_COMPRESSION_64 = object()
-ZIP_32 = object()
-ZIP_64 = object()
+ZIP_32 = 0b00
+ZIP_64 = 0b01
+NO_COMPRESSION = 0b10
+NO_COMPRESSION_32 = NO_COMPRESSION | ZIP_32
+NO_COMPRESSION_64 = NO_COMPRESSION | ZIP_64
 
-def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9)):
+_MAX_ZIP32_FILE_OFFSET = 0xffffffff
+_MAX_ZIP64_FILE_OFFSET = 0xffffffffffffffff
+
+
+def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9), allow_upgrade_to_64=False):
 
     def evenly_sized(chunks):
         chunk = b''
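
The methods are now small integers that compose as bit flags, which is what lets a file be upgraded in place with method |= ZIP_64 later in the diff. A quick sanity check of the scheme, runnable on its own with the constants copied from above:

    ZIP_32 = 0b00
    ZIP_64 = 0b01
    NO_COMPRESSION = 0b10
    NO_COMPRESSION_32 = NO_COMPRESSION | ZIP_32  # 0b10
    NO_COMPRESSION_64 = NO_COMPRESSION | ZIP_64  # 0b11

    # OR-ing ZIP_64 into either 32-bit method lands on its 64-bit counterpart:
    assert ZIP_32 | ZIP_64 == ZIP_64
    assert NO_COMPRESSION_32 | ZIP_64 == NO_COMPRESSION_64

    # And `method & ZIP_64` is truthy exactly for the 64-bit variants, which is
    # how the diff derives zip_64_central_directory below:
    assert not (ZIP_32 & ZIP_64)
    assert NO_COMPRESSION_64 & ZIP_64
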
@@ -140,7 +145,7 @@ def _zip_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
     def _zip_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
         file_offset = offset
 
-        _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError)
+        _raise_if_beyond(file_offset, maximum=_MAX_ZIP32_FILE_OFFSET, exception_class=OffsetOverflowError)
 
         yield from _(local_header_signature)
         yield from _(local_header_struct.pack(
@@ -216,9 +221,9 @@ def _zip_data(chunks, max_uncompressed_size, max_compressed_size):
     def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
         file_offset = offset
 
-        _raise_if_beyond(file_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError)
+        _raise_if_beyond(file_offset, maximum=_MAX_ZIP64_FILE_OFFSET, exception_class=OffsetOverflowError)
 
-        chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffffffffffff)
+        chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=_MAX_ZIP64_FILE_OFFSET)
 
         extra = zip_64_local_extra_struct.pack(
             zip_64_extra_signature,
@@ -275,7 +280,7 @@ def _no_compression_64_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
     def _no_compression_32_local_header_and_data(name_encoded, mod_at_encoded, external_attr, chunks):
         file_offset = offset
 
-        _raise_if_beyond(file_offset, maximum=0xffffffff, exception_class=OffsetOverflowError)
+        _raise_if_beyond(file_offset, maximum=_MAX_ZIP32_FILE_OFFSET, exception_class=OffsetOverflowError)
 
         chunks, size, crc_32 = _no_compression_buffered_data_size_crc_32(chunks, maximum_size=0xffffffff)
 
@@ -338,8 +343,6 @@ def _chunks():
         return chunks, size, crc_32
 
     for name, modified_at, perms, method, chunks in files:
-        zip_64_central_directory = zip_64_central_directory or method in (ZIP_64, NO_COMPRESSION_64)
-
         name_encoded = name.encode('utf-8')
         _raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)
 
@@ -355,20 +358,32 @@ def _chunks():
             (perms << 16) | \
             (0x10 if name_encoded[-1:] == b'/' else 0x0)  # MS-DOS directory
 
+        if allow_upgrade_to_64 and offset > _MAX_ZIP32_FILE_OFFSET:
+            # Force zip64 for this file - we can't write files with this offset in zip32
+            method |= ZIP_64
+
+
+        zip_64_central_directory = zip_64_central_directory or method & ZIP_64
+
         data_func = \
             _zip_64_local_header_and_data if method is ZIP_64 else \
             _zip_32_local_header_and_data if method is ZIP_32 else \
             _no_compression_64_local_header_and_data if method is NO_COMPRESSION_64 else \
             _no_compression_32_local_header_and_data
         central_directory.append((yield from data_func(name_encoded, mod_at_encoded, external_attr, evenly_sized(chunks))))
 
+    central_directory_start_offset = offset
+    if allow_upgrade_to_64 and central_directory_start_offset > _MAX_ZIP32_FILE_OFFSET:
+        # Even if all files written are zip32, we need to write a zip64 central directory if the total offset is >4GiB
+        zip_64_central_directory = True
+
     max_central_directory_length, max_central_directory_start_offset, max_central_directory_size = \
-        (0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff) if zip_64_central_directory else \
-        (0xffff, 0xffffffff, 0xffffffff)
+        (_MAX_ZIP64_FILE_OFFSET, _MAX_ZIP64_FILE_OFFSET, _MAX_ZIP64_FILE_OFFSET) if zip_64_central_directory else \
+        (0xffff, _MAX_ZIP32_FILE_OFFSET, _MAX_ZIP32_FILE_OFFSET)
 
-    central_directory_start_offset = offset
     central_directory_end_offset = offset
     central_directory_size = central_directory_end_offset - central_directory_start_offset
 
     _raise_if_beyond(central_directory_start_offset, maximum=max_central_directory_start_offset, exception_class=OffsetOverflowError)
     _raise_if_beyond(len(central_directory), maximum=max_central_directory_length, exception_class=CentralDirectoryNumberOfEntriesOverflowError)
 
@@ -381,8 +396,14 @@ def _chunks():
     central_directory_end_offset = offset
     central_directory_size = central_directory_end_offset - central_directory_start_offset
 
-    _raise_if_beyond(central_directory_end_offset, maximum=0xffffffffffffffff, exception_class=OffsetOverflowError)
-    _raise_if_beyond(central_directory_size, maximum=max_central_directory_size, exception_class=CentralDirectorySizeOverflowError)
+    zip_64_end_of_central_directory = zip_64_central_directory
+    if allow_upgrade_to_64 and central_directory_end_offset > _MAX_ZIP32_FILE_OFFSET:
+        # Even if all the files written are zip32 and the central directory is zip32, we need to write a zip64 EOCD record if the total offset now is >4GiB
+        zip_64_end_of_central_directory = True
+
+    max_central_directory_end_offset = _MAX_ZIP64_FILE_OFFSET if zip_64_end_of_central_directory else _MAX_ZIP32_FILE_OFFSET
+    _raise_if_beyond(central_directory_size, maximum=max_central_directory_size, exception_class=CentralDirectorySizeOverflowError)
+    _raise_if_beyond(central_directory_end_offset, maximum=max_central_directory_end_offset, exception_class=OffsetOverflowError)
 
     if zip_64_central_directory:
         yield from _(zip_64_end_of_central_directory_signature)
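
Taken together, the change guards three separate zip32 limits, all gated on allow_upgrade_to_64. A plain-Python summary of when each upgrade fires, using the PR's own constant:

    ZIP32_LIMIT = 0xffffffff           # largest value a 4-byte zip field can hold
    assert ZIP32_LIMIT == 2 ** 32 - 1  # the ">4GiB" in the comments above

    # 1. Per file: a local header that would start past ZIP32_LIMIT promotes the
    #    file's method with `method |= ZIP_64`.
    # 2. Central directory: a central directory *starting* past ZIP32_LIMIT
    #    forces zip_64_central_directory = True.
    # 3. End of central directory: even a zip32 central directory can *end*
    #    past ZIP32_LIMIT, in which case only the EOCD record is written as zip64.
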
120 changes: 102 additions & 18 deletions test_stream_zip.py
@@ -1,6 +1,7 @@
 from datetime import datetime
 from io import BytesIO
 import contextlib
+import itertools
 import os
 import stat
 import subprocess
@@ -35,6 +36,28 @@ def cwd(new_dir):
         os.chdir(old_dir)
 
 
+
+def consume_stream(streamed_zip):
+    """
+    Helper - consumes generators in a streamed zip.
+    Returns the total length of the zip file.
+    """
+    return sum((len(chunk) for chunk in streamed_zip), 0)
+
+
+def consume_stream_unzip(streamed_zip) -> int:
+    """
+    Helper - consumes generators in a streamed zip, passing through stream_unzip to
+    ensure the stream is valid.
+    Returns the total length of the unzipped content.
+    """
+    num_received = 0
+    for name, size, chunks in stream_unzip(streamed_zip):
+        for chunk in chunks:
+            num_received += len(chunk)
+    return num_received
+
+
 def test_with_stream_unzip_zip_64():
     now = datetime.fromisoformat('2021-01-01 21:01:12')
     perms = 0o600
@@ -103,14 +126,60 @@ def data():

         yield 'file-1', now, perms, ZIP_64, data()
 
-    num_received = 0
-    for name, size, chunks in stream_unzip(stream_zip(files())):
-        for chunk in chunks:
-            num_received += len(chunk)
-
+    num_received = consume_stream_unzip(stream_zip(files()))
     assert num_received == 5000000000
 
 
+def test_zip64_central_directory_after_only_zip32_files():
+    now = datetime.fromisoformat("2021-01-01 21:01:12")
+    perms = 0o600
+
+    def files():
+        for i in range(4):
+            # 1.1GB of data per file
+            data = itertools.repeat(b"0" * 1000000, 1100)
+            yield f"file-{i}", now, perms, NO_COMPRESSION_32, data
+
+        # now the file offset is >4GiB, but so far no overflow has been raised.
+        return
+
+    # Using allow_upgrade_to_64=False (the default) raises an overflow error *after* writing all the
+    # files, because the zip32 central directory can't be written.
+    with pytest.raises(OffsetOverflowError):
+        consume_stream(stream_zip(files()))
+
+    # Doing the same with it set to True just generates a zip64 central directory instead.
+    consume_stream_unzip(stream_zip(files(), allow_upgrade_to_64=True))
+
+
+def test_zip64_end_of_central_directory_after_zip32_central_directory():
+    now = datetime.fromisoformat("2021-01-01 21:01:12")
+    perms = 0o600
+
+    def files():
+        def data():
+            somebytes = b"0" * 429496
+            yield from itertools.repeat(somebytes, 10000)
+            # there are 7295 bytes remaining until we hit 4GiB; use up almost all of them
+            yield b"0" * 7290
+
+        yield "file-1", now, perms, NO_COMPRESSION_32, data()
+
+        # Here the file offset is still JUST under 4GiB, but writing the central directory will put it
+        # over the limit. To be clear, writing the zip32 central directory *works fine* - but the *end*
+        # of central directory record cannot be written, since it must contain a 32-bit pointer to an
+        # offset over 4GiB.
+        return
+
+    # Using allow_upgrade_to_64=False (the default) raises an overflow error *after* writing all the
+    # files, because the zip32 end of central directory record can't be written.
+    with pytest.raises(OffsetOverflowError):
+        consume_stream(stream_zip(files()))
+
+    # Doing the same with it set to True just generates a zip64 end of central directory record instead.
+    consume_stream_unzip(stream_zip(files(), allow_upgrade_to_64=True))
+
+
 def test_with_stream_unzip_large_not_easily_compressible_with_no_compression_64():
     now = datetime.fromisoformat('2021-01-01 21:01:12')
     perms = 0o600
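
As a cross-check of the sizes chosen in test_zip64_end_of_central_directory_after_zip32_central_directory above, the data is kept just under the zip32 offset limit so that only the end-of-central-directory record overflows:

    # 10000 chunks of 429496 bytes, verified against the 0xffffffff limit:
    assert 429496 * 10000 == 4_294_960_000
    assert 0xffffffff - 4_294_960_000 == 7295  # the "7295 bytes remaining"
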
@@ -124,11 +193,7 @@ def data():
         yield 'file-1', now, perms, ZIP_64, data()
         yield 'file-2', now, perms, NO_COMPRESSION_64, (b'-',)
 
-    num_received = 0
-    for name, size, chunks in stream_unzip(stream_zip(files())):
-        for chunk in chunks:
-            num_received += len(chunk)
-
+    num_received = consume_stream_unzip(stream_zip(files()))
     assert num_received == 5000000001
 
 
@@ -146,9 +211,31 @@ def data():
         yield 'file-2', now, perms, NO_COMPRESSION_32, (b'-',)
 
     with pytest.raises(OffsetOverflowError):
-        for name, size, chunks in stream_unzip(stream_zip(files())):
-            for chunk in chunks:
-                pass
+        consume_stream(stream_zip(files()))
+
+
+def test_zip32_file_after_long_offset_upgraded_to_zip64():
+    now = datetime.fromisoformat('2021-01-01 21:01:12')
+    perms = 0o600
+    batch = b'0' * 500000
+
+    def files():
+        def data():
+            for i in range(0, 10000):
+                yield batch
+
+        # write 5GB of data
+        yield 'file-1', now, perms, NO_COMPRESSION_64, data()
+        # now the file offset is >4GiB, so writing the next file as ZIP32 would fail.
+        yield 'file-2', now, perms, NO_COMPRESSION_32, (b'-',)
+
+    # Using allow_upgrade_to_64=True automatically upgrades the file to be written as ZIP64
+    # due to the file offset being too large.
+    consume_stream_unzip(stream_zip(files(), allow_upgrade_to_64=True))
+
+    # Doing the same with it set to False (the default) raises an overflow error instead:
+    with pytest.raises(OffsetOverflowError):
+        consume_stream(stream_zip(files(), allow_upgrade_to_64=False))
 
 
 def test_with_stream_unzip_large_not_easily_compressible_with_zip_32():
@@ -165,9 +252,7 @@ def data():
         yield 'file-2', now, perms, ZIP_32, (b'-',)  # Needs a ZIP_64 offset, but is in ZIP_32 mode
 
     with pytest.raises(OffsetOverflowError):
-        for name, size, chunks in stream_unzip(stream_zip(files())):
-            for chunk in chunks:
-                pass
+        consume_stream(stream_zip(files()))
 
 
 def test_zip_overflow_large_not_easily_compressible():
@@ -183,8 +268,7 @@ def data():
         yield 'file-1', now, perms, ZIP_32, data()
 
     with pytest.raises(CompressedSizeOverflowError):
-        for chunk in stream_zip(files()):
-            pass
+        consume_stream(stream_zip(files()))
 
 
 def test_zip_overflow_large_easily_compressible():
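
A possible further assertion for these tests, sketched here rather than part of the PR: an upgraded archive can be distinguished from a plain zip32 one by the zip64 end-of-central-directory signature, the 4 bytes 0x06064b50 stored little-endian as PK\x06\x06. The helper name and buffer size are illustrative:

    def ends_with_zip64_eocd(streamed_zip):
        # Keep only the tail of the stream; the zip64 EOCD record (if present)
        # sits near the end, followed by the zip64 EOCD locator and the
        # ordinary zip32 EOCD record.
        tail = b''
        for chunk in streamed_zip:
            tail = (tail + chunk)[-1024:]
        return b'PK\x06\x06' in tail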