Skip to content

Commit

Permalink
feat: add ZIP_AUTO mode
Browse files Browse the repository at this point in the history
This allows clients to not worry about which ZIP_32 or ZIP_64 mode they need,
as long as they have the uncompressed size of the file.

Most discussion is in #42
  • Loading branch information
michalc committed Jun 3, 2023
1 parent 17a4f36 commit 0f292c9
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 15 deletions.
23 changes: 15 additions & 8 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ This is an example of a ZIP showing all supported compression and metadata format

```python
from datetime import datetime
from stream_zip import ZIP_64, ZIP_32, NO_COMPRESSION_64, NO_COMPRESSION_32, stream_zip
from stream_zip import ZIP_AUTO, ZIP_64, ZIP_32, NO_COMPRESSION_64, NO_COMPRESSION_32, stream_zip

def member_files():
modified_at = datetime.now()
Expand All @@ -111,30 +111,37 @@ def member_files():
yield b'Some bytes 1'

def file_2_data():
yield b'Some bytes 1'
yield b'Some bytes 2'

def file_3_data():
yield b'Some bytes 1'
yield b'Some bytes 2'
yield b'Some bytes 3'
yield b'Some bytes 4'

def file_4_data():
yield b'Some bytes 4'
yield b'Some bytes 5'
yield b'Some bytes 6'
yield b'Some bytes 7'

def file_5_data():
for i in range(5):
yield bytes(f'Some bytes {i}', encoding="utf-8")

# ZIP_64 mode
yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_1_data()
yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_2_data()

# ZIP_32 mode
yield 'my-file-2.txt', modified_at, perms, ZIP_32, file_2_data()
yield 'my-file-2.txt', modified_at, perms, ZIP_32, file_3_data()

# ZIP_AUTO to choose between ZIP_32 and ZIP_64 automatically based on
# the uncompressed size of data
yield 'my-file-3.txt', modified_at, perms, ZIP_AUTO(uncompressed_size=12), file_1_data()

# No compression for ZIP_64 files
yield 'my-file-3.txt', modified_at, perms, NO_COMPRESSION_64, file_3_data()
yield 'my-file-4.txt', modified_at, perms, NO_COMPRESSION_64, file_4_data()

# No compression for ZIP_32 files
yield 'my-file-4.txt', modified_at, perms, NO_COMPRESSION_32, file_4_data()
yield 'my-file-5.txt', modified_at, perms, NO_COMPRESSION_32, file_5_data()

for zipped_chunk in stream_zip(member_files()):
print(zipped_chunk)
Expand Down
39 changes: 33 additions & 6 deletions stream_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,42 @@
# Sentinel objects identifying which ZIP structure "method" a member uses.
_ZIP_32 = object()
_ZIP_64 = object()

# Sentinels controlling whether the central directory may be automatically
# upgraded to ZIP 64 when a member's offset exceeds the 32-bit limit
# (0xffffffff). ZIP_AUTO opts in; the explicit modes opt out.
_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()
_NO_AUTO_UPGRADE_CENTRAL_DIRECTORY = object()

def NO_COMPRESSION_32(offset, default_get_compressobj):
    """Mode function: store the member uncompressed with ZIP 32-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off: the central directory is never upgraded to
    ZIP 64 for this mode based on offset alone.
    """
    # NOTE(review): the capture contained a stale pre-change two-tuple return
    # above this one (dead code from the diff); only the triple form is live.
    return _NO_COMPRESSION_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def NO_COMPRESSION_64(offset, default_get_compressobj):
    """Mode function: store the member uncompressed with ZIP 64-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off (ZIP 64 is already forced by the method itself).
    """
    # NOTE(review): removed the stale pre-change two-tuple return left in the
    # diff capture; the triple form is the live one.
    return _NO_COMPRESSION_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def ZIP_32(offset, default_get_compressobj):
    """Mode function: deflate-compress the member with ZIP 32-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off: callers choosing ZIP_32 explicitly get no
    automatic ZIP 64 central directory upgrade.
    """
    # NOTE(review): removed the stale pre-change two-tuple return left in the
    # diff capture; the triple form is the live one.
    return _ZIP_32, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def ZIP_64(offset, default_get_compressobj):
    """Mode function: deflate-compress the member with ZIP 64-bit structures.

    Returns a (method sentinel, auto-upgrade flag, compressobj factory) triple,
    matching how stream_zip unpacks the mode function's result. The
    auto-upgrade flag is off (ZIP 64 is already forced by the method itself).
    """
    # NOTE(review): removed the stale pre-change two-tuple return left in the
    # diff capture; the triple form is the live one.
    return _ZIP_64, _NO_AUTO_UPGRADE_CENTRAL_DIRECTORY, default_get_compressobj

def ZIP_AUTO(uncompressed_size, level=9):
    """Return a mode function that picks ZIP_32 or ZIP_64 automatically.

    The choice is made per member from its known uncompressed size and its
    offset in the archive: ZIP 64 structures are used when either the
    worst-case compressed size or the offset could overflow a 32-bit field,
    ZIP 32 structures otherwise. The returned callable has the same shape as
    ZIP_32/ZIP_64 and opts in to automatic central directory upgrade.
    """
    # Largest uncompressed size whose worst-case deflate output still fits a
    # 32-bit field. Worked out from zlib's bound
    # https://github.com/madler/zlib/blob/04f42ceca40f73e2978b50e93806c2a18c1281fc/deflate.c#L696
    # by assuming compressed size cannot exceed
    #
    #     uncompressed_size + (u >> 12) + (u >> 14) + (u >> 25) + 7
    #
    # That is the 0.03% deflate bound for the default memLevel of 8 (and
    # default wbits); a higher memLevel is assumed never to exceed it — it
    # should in fact compress smaller, which experiments on random data back
    # up. Python's interaction with zlib also varies between versions
    # (https://stackoverflow.com/q/76371334/1319998) and could emit extra
    # deflate chunks, but forcing memLevel=9 below (smaller output) gives a
    # margin of safety. A lower limit probably exists, but would need
    # certainty about zlib at memLevel 9 and Python's interaction with it.
    zip_32_max_uncompressed = 4293656841

    def auto_mode(offset, default_get_compressobj):
        needs_zip_64 = uncompressed_size > zip_32_max_uncompressed or offset > 0xffffffff
        method = _ZIP_64 if needs_zip_64 else _ZIP_32
        get_compressobj = lambda: zlib.compressobj(level=level, memLevel=9, wbits=-zlib.MAX_WBITS)
        return method, _AUTO_UPGRADE_CENTRAL_DIRECTORY, get_compressobj

    return auto_mode


def stream_zip(files, chunk_size=65536, get_compressobj=lambda: zlib.compressobj(wbits=-zlib.MAX_WBITS, level=9)):
Expand Down Expand Up @@ -353,7 +378,7 @@ def _chunks():
return chunks, size, crc_32

for name, modified_at, perms, method, chunks in files:
_method, _get_compress_obj = method(offset, get_compressobj)
_method, _auto_upgrade_central_directory, _get_compress_obj = method(offset, get_compressobj)

name_encoded = name.encode('utf-8')
_raise_if_beyond(len(name_encoded), maximum=0xffff, exception_class=NameLengthOverflowError)
Expand All @@ -377,7 +402,9 @@ def _chunks():
_no_compression_32_local_header_and_data
central_directory.append((yield from data_func(name_encoded, mod_at_encoded, external_attr, _get_compress_obj, evenly_sized(chunks))))

zip_64_central_directory = zip_64_central_directory or _method in (_ZIP_64, _NO_COMPRESSION_64)
zip_64_central_directory = zip_64_central_directory \
or (_auto_upgrade_central_directory is _AUTO_UPGRADE_CENTRAL_DIRECTORY and offset > 0xffffffff) \
or _method in (_ZIP_64, _NO_COMPRESSION_64)

max_central_directory_length, max_central_directory_start_offset, max_central_directory_size = \
(0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff) if zip_64_central_directory else \
Expand Down
100 changes: 99 additions & 1 deletion test_stream_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
from zipfile import ZipFile

import pytest
from stream_unzip import stream_unzip
from stream_unzip import UnsupportedZip64Error, stream_unzip

from stream_zip import (
stream_zip,
NO_COMPRESSION_64,
NO_COMPRESSION_32,
ZIP_AUTO,
ZIP_64,
ZIP_32,
CompressedSizeOverflowError,
Expand All @@ -35,6 +36,14 @@ def cwd(new_dir):
os.chdir(old_dir)


def gen_bytes(num):
    """Yield exactly *num* filler bytes (b'-') in chunks of at most 100,000."""
    filler = b'-' * 100000
    remaining = num
    while remaining:
        step = min(len(filler), remaining)
        yield filler[:step]
        remaining -= step


def test_with_stream_unzip_zip_64():
now = datetime.fromisoformat('2021-01-01 21:01:12')
perms = 0o600
Expand Down Expand Up @@ -91,6 +100,95 @@ def files():
]


def test_with_stream_unzip_auto_small():
    # ZIP_AUTO on small members must emit ZIP_32 records, so the archive
    # round-trips even when the unzipper forbids ZIP 64 structures.
    modified_at = datetime.fromisoformat('2021-01-01 21:01:12')
    mode = 0o600

    def member_files():
        yield 'file-1', modified_at, mode, ZIP_AUTO(20000), (b'a' * 10000, b'b' * 10000)
        yield 'file-2', modified_at, mode, ZIP_AUTO(2), (b'c', b'd')

    unzipped = [
        (name, size, b''.join(chunks))
        for name, size, chunks in stream_unzip(stream_zip(member_files()), allow_zip64=False)
    ]
    assert unzipped == [
        (b'file-1', None, b'a' * 10000 + b'b' * 10000),
        (b'file-2', None, b'cd'),
    ]


@pytest.mark.parametrize(
    "level",
    [
        0,
        9,
    ],
)
def test_with_stream_unzip_at_zip_32_limit(level):
    # 4293656841 is the largest uncompressed size ZIP_AUTO keeps as ZIP_32,
    # so unzipping with allow_zip64=False must succeed at exactly this size.
    modified_at = datetime.fromisoformat('2021-01-01 21:01:12')
    mode = 0o600

    def member_files():
        yield 'file-1', modified_at, mode, ZIP_AUTO(4293656841, level=level), gen_bytes(4293656841)

    results = [
        (name, size, sum(len(chunk) for chunk in chunks))
        for name, size, chunks in stream_unzip(stream_zip(member_files()), allow_zip64=False)
    ]
    assert results == [(b'file-1', None, 4293656841)]


@pytest.mark.parametrize(
    "level",
    [
        0,
        9,
    ],
)
def test_with_stream_unzip_above_zip_32_size_limit(level):
    # One byte past the ZIP_32 threshold: ZIP_AUTO must switch to ZIP_64,
    # which unzips normally but is rejected when ZIP 64 is disallowed.
    modified_at = datetime.fromisoformat('2021-01-01 21:01:12')
    mode = 0o600

    def member_files():
        yield 'file-1', modified_at, mode, ZIP_AUTO(4293656842, level=level), gen_bytes(4293656842)

    results = [
        (name, size, sum(len(chunk) for chunk in chunks))
        for name, size, chunks in stream_unzip(stream_zip(member_files()))
    ]
    assert results == [(b'file-1', None, 4293656842)]

    with pytest.raises(UnsupportedZip64Error):
        next(iter(stream_unzip(stream_zip(member_files()), allow_zip64=False)))


def test_with_stream_unzip_above_zip_32_offset_limit():
    """ZIP_AUTO members that are individually small enough for ZIP_32 must
    still switch to ZIP_64 once their offset in the archive passes 0xffffffff.

    file-1 and file-2 (4 GB each, stored at level=0) are ZIP_32, but file-3
    starts beyond the 32-bit offset limit so it must be written as ZIP_64.
    """
    now = datetime.fromisoformat('2021-01-01 21:01:12')
    perms = 0o600

    def files():
        yield 'file-1', now, perms, ZIP_AUTO(4000000000, level=0), gen_bytes(4000000000)
        yield 'file-2', now, perms, ZIP_AUTO(4000000000, level=0), gen_bytes(4000000000)
        yield 'file-3', now, perms, ZIP_AUTO(1, level=0), gen_bytes(1)

    # With ZIP 64 allowed, the whole archive round-trips.
    assert [(b'file-1', None, 4000000000), (b'file-2', None, 4000000000), (b'file-3', None, 1)] == [
        (name, size, sum(len(chunk) for chunk in chunks))
        for name, size, chunks in stream_unzip(stream_zip(files()))
    ]

    # With ZIP 64 disallowed, the first two members must still unzip (proving
    # they are ZIP_32); only reaching file-3 may raise. The flags record how
    # far we got before the expected failure.
    file_1_zip_32 = False
    file_2_zip_32 = False
    with pytest.raises(UnsupportedZip64Error):
        it = iter(stream_unzip(stream_zip(files()), allow_zip64=False))
        name, size, chunks = next(it)
        # Each member's chunks must be fully consumed before advancing.
        for c in chunks:
            pass
        file_1_zip_32 = True
        name, size, chunks = next(it)
        for c in chunks:
            pass
        file_2_zip_32 = True
        name, size, chunks = next(it)

    assert file_1_zip_32
    assert file_2_zip_32


def test_with_stream_unzip_large_easily_compressible():
now = datetime.fromisoformat('2021-01-01 21:01:12')
perms = 0o600
Expand Down

0 comments on commit 0f292c9

Please sign in to comment.