Skip to content

Commit

Permalink
zip: Add format decoder
Browse files Browse the repository at this point in the history
  • Loading branch information
wader committed Nov 18, 2021
1 parent 5344c7e commit d838d2f
Show file tree
Hide file tree
Showing 19 changed files with 1,789 additions and 186 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
"mfhd",
"mfra",
"mfro",
"mitchellh",
"MJPEG",
"moof",
"moov",
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ cp fq /usr/local/bin

[./formats_list.jq]: sh-start

aac_frame, adts, adts_frame, apev2, av1_ccr, av1_frame, av1_obu, avc_annexb, avc_au, avc_dcr, avc_nalu, avc_pps, avc_sei, avc_sps, bzip2, dns, elf, exif, flac, flac_frame, flac_metadatablock, flac_metadatablocks, flac_picture, flac_streaminfo, gif, gzip, hevc_annexb, hevc_au, hevc_dcr, hevc_nalu, icc_profile, id3v1, id3v11, id3v2, jpeg, json, matroska, mp3, mp3_frame, mp4, mpeg_asc, mpeg_es, mpeg_pes, mpeg_pes_packet, mpeg_spu, mpeg_ts, ogg, ogg_page, opus_packet, png, protobuf, protobuf_widevine, pssh_playready, raw, tar, tiff, vorbis_comment, vorbis_packet, vp8_frame, vp9_cfm, vp9_frame, vpx_ccr, wav, webp, xing
aac_frame, adts, adts_frame, apev2, av1_ccr, av1_frame, av1_obu, avc_annexb, avc_au, avc_dcr, avc_nalu, avc_pps, avc_sei, avc_sps, bzip2, dns, elf, exif, flac, flac_frame, flac_metadatablock, flac_metadatablocks, flac_picture, flac_streaminfo, gif, gzip, hevc_annexb, hevc_au, hevc_dcr, hevc_nalu, icc_profile, id3v1, id3v11, id3v2, jpeg, json, matroska, mp3, mp3_frame, mp4, mpeg_asc, mpeg_es, mpeg_pes, mpeg_pes_packet, mpeg_spu, mpeg_ts, ogg, ogg_page, opus_packet, png, protobuf, protobuf_widevine, pssh_playready, raw, tar, tiff, vorbis_comment, vorbis_packet, vp8_frame, vp9_cfm, vp9_frame, vpx_ccr, wav, webp, xing, zip

[#]: sh-end

Expand Down
3 changes: 2 additions & 1 deletion doc/formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@
|`wav` |WAV&nbsp;file |<sub>`id3v2` `id3v1` `id3v11`</sub>|
|`webp` |WebP&nbsp;image |<sub>`vp8_frame`</sub>|
|`xing` |Xing&nbsp;header |<sub></sub>|
|`zip` |ZIP&nbsp;archive |<sub>`probe`</sub>|
|`image` |Group |<sub>`gif` `jpeg` `mp4` `png` `tiff` `webp`</sub>|
|`probe` |Group |<sub>`adts` `bzip2` `elf` `flac` `gif` `gzip` `jpeg` `json` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `png` `tar` `tiff` `wav` `webp`</sub>|
|`probe` |Group |<sub>`adts` `bzip2` `elf` `flac` `gif` `gzip` `jpeg` `json` `matroska` `mp3` `mp4` `mpeg_ts` `ogg` `png` `tar` `tiff` `wav` `webp` `zip`</sub>|

[#]: sh-end

Expand Down
384 changes: 202 additions & 182 deletions doc/formats.svg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions format/all/all.fqtest
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ $ fq -n _registry.groups.probe
"tar",
"tiff",
"webp",
"zip",
"mpeg_ts",
"wav",
"mp3",
Expand Down
1 change: 1 addition & 0 deletions format/all/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ import (
_ "github.com/wader/fq/format/vpx"
_ "github.com/wader/fq/format/wav"
_ "github.com/wader/fq/format/webp"
_ "github.com/wader/fq/format/zip"
)
1 change: 1 addition & 0 deletions format/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ const (
VPX_CCR = "vpx_ccr"
WAV = "wav"
WEBP = "webp"
ZIP = "zip"
)

// below are data types used to communicate between formats <FormatName>In/Out
Expand Down
414 changes: 414 additions & 0 deletions format/zip/testdata/test-macos.fqtest

Large diffs are not rendered by default.

Binary file added format/zip/testdata/test-macos.zip
Binary file not shown.
1 change: 1 addition & 0 deletions format/zip/testdata/test/a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
1 change: 1 addition & 0 deletions format/zip/testdata/test/a/a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
aaaa
405 changes: 405 additions & 0 deletions format/zip/testdata/test0.fqtest

Large diffs are not rendered by default.

Binary file added format/zip/testdata/test0.zip
Binary file not shown.
403 changes: 403 additions & 0 deletions format/zip/testdata/test9.fqtest

Large diffs are not rendered by default.

Binary file added format/zip/testdata/test9.zip
Binary file not shown.
338 changes: 338 additions & 0 deletions format/zip/zip.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,338 @@
package zip

// https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
// https://opensource.apple.com/source/zip/zip-6/unzip/unzip/proginfo/extra.fld

import (
"bytes"
"compress/flate"
"io"

"github.com/wader/fq/format"
"github.com/wader/fq/format/registry"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
)

// probeFormat is the format group requested through the format.PROBE
// dependency declared in init. zipDecode passes it to FieldTryFormatBitBuf to
// try to decode the decompressed content of each archived file.
var probeFormat decode.Group

func init() {
registry.MustRegister(decode.Format{
Name: format.ZIP,
Description: "ZIP archive",
Groups: []string{format.PROBE},
DecodeFn: zipDecode,
Dependencies: []decode.Dependency{
{Names: []string{format.PROBE}, Group: &probeFormat},
},
})
}

// Values of the 16-bit compression_method field found in central directory
// records and local file headers. zipDecode only treats compressionMethodNone
// and compressionMethodDeflated specially; the remaining values are used for
// symbolic names via compressionMethodMap.
const (
compressionMethodNone = 0
compressionMethodShrunk = 1
compressionMethodReducedCompressionFactor1 = 2
compressionMethodReducedCompressionFactor2 = 3
compressionMethodReducedCompressionFactor3 = 4
compressionMethodReducedCompressionFactor4 = 5
compressionMethodImploded = 6
compressionMethodDeflated = 8
compressionMethodEnhancedDeflated = 9
compressionMethodPKWareDCLImploded = 10
compressionMethodBzip2 = 12
compressionMethodLZMA = 14
compressionMethodIBMTERSE = 18
compressionMethodIBMLZ77z = 19
compressionMethodPPMd = 98
)

// compressionMethodMap maps compression method values to symbolic names. It is
// passed to d.MapUToStrSym when decoding the compression_method field.
var compressionMethodMap = decode.UToStr{
compressionMethodNone: "None",
compressionMethodShrunk: "Shrunk",
compressionMethodReducedCompressionFactor1: "ReducedCompressionFactor1",
compressionMethodReducedCompressionFactor2: "ReducedCompressionFactor2",
compressionMethodReducedCompressionFactor3: "ReducedCompressionFactor3",
compressionMethodReducedCompressionFactor4: "ReducedCompressionFactor4",
compressionMethodImploded: "Imploded",
compressionMethodDeflated: "Deflated",
compressionMethodEnhancedDeflated: "EnhancedDeflated",
compressionMethodPKWareDCLImploded: "PKWareDCLImploded",
compressionMethodBzip2: "Bzip2",
compressionMethodLZMA: "LZMA",
compressionMethodIBMTERSE: "IBMTERSE",
compressionMethodIBMLZ77z: "IBMLZ77z",
compressionMethodPPMd: "PPMd",
}

// Four byte signatures that start each kind of ZIP record. zipDecode validates
// them with d.ValidateBitBuf when decoding the corresponding "signature" field.
var (
centralDirectorySignature = []byte("PK\x01\x02")
endOfCentralDirectorySignature = []byte("PK\x05\x06")
// The bytes of endOfCentralDirectorySignature (50 4b 05 06) interpreted as a
// 32-bit little-endian integer. zipDecode compares it with the values handed
// to its TryPeekFind callback when searching for the end of central directory.
endOfCentralDirectorySignatureN = 0x06054b50
localFileSignature = []byte("PK\x03\x04")
// Optional signature in front of the data descriptor that follows file data.
dataIndicatorSignature = []byte("PK\x07\x08")
)

// headerIDMap attaches a description to known extra field header IDs. It is
// passed to d.MapUToScalar when decoding the header_id of each extra field.
// NOTE(review): descriptions presumably come from the extra.fld document linked
// at the top of the file — confirm before editing entries.
var headerIDMap = decode.UToScalar{
0x0001: {Description: "ZIP64 extended information extra field"},
0x0007: {Description: "AV Info"},
0x0009: {Description: "OS/2 extended attributes"},
0x000a: {Description: "NTFS (Win9x/WinNT FileTimes)"},
0x000c: {Description: "OpenVMS"},
0x000d: {Description: "Unix"},
0x000f: {Description: "Patch Descriptor"},
0x0014: {Description: "PKCS#7 Store for X.509 Certificates"},
0x0015: {Description: "X.509 Certificate ID and Signature for individual file"},
0x0016: {Description: "X.509 Certificate ID for Central Directory"},
0x0065: {Description: "IBM S/390 attributes - uncompressed"},
0x0066: {Description: "IBM S/390 attributes - compressed"},
0x07c8: {Description: "Info-ZIP Macintosh (old, J. Lee)"},
0x2605: {Description: "ZipIt Macintosh (first version)"},
0x2705: {Description: "ZipIt Macintosh v 1.3.5 and newer (w/o full filename)"},
0x334d: {Description: "Info-ZIP Macintosh (new, D. Haase's 'Mac3' field )"},
0x4154: {Description: "Tandem NSK"},
0x4341: {Description: "Acorn/SparkFS (David Pilling)"},
0x4453: {Description: "Windows NT security descriptor (binary ACL)"},
0x4704: {Description: "VM/CMS"},
0x470f: {Description: "MVS"},
// "inofficial" in original table
//nolint:misspell
0x4854: {Description: "Theos, old inofficial port"},
0x4b46: {Description: "FWKCS MD5 (see below)"},
0x4c41: {Description: "OS/2 access control list (text ACL)"},
0x4d49: {Description: "Info-ZIP OpenVMS (obsolete)"},
0x4d63: {Description: "Macintosh SmartZIP, by Macro Bambini"},
0x4f4c: {Description: "Xceed original location extra field"},
0x5356: {Description: "AOS/VS (binary ACL)"},
0x5455: {Description: "extended timestamp"},
0x5855: {Description: "Info-ZIP Unix (original; also OS/2, NT, etc.)"},
0x554e: {Description: "Xceed unicode extra field"},
0x6542: {Description: "BeOS (BeBox, PowerMac, etc.)"},
0x6854: {Description: "Theos"},
0x756e: {Description: "ASi Unix"},
0x7855: {Description: "Info-ZIP Unix (new)"},
0x7875: {Description: "UNIX UID/GID"},
0xfb4a: {Description: "SMS/QDOS"},
}

// "MS-DOS uses year values relative to 1980 and 2 second precision."
// fieldMSDOSTime decodes a 16-bit MS-DOS time as three bit fields: 5-bit
// "hours", 6-bit "minutes" and 5-bit "seconds". It is meant to be used as the
// body of a d.FieldStruct call.
// NOTE(review): given the 2 second precision quoted above, "seconds" presumably
// holds the number of 2-second units rather than seconds — confirm.
func fieldMSDOSTime(d *decode.D) {
d.FieldU5("hours")
d.FieldU6("minutes")
d.FieldU5("seconds")
}

// fieldMSDOSDate decodes a 16-bit MS-DOS date as three bit fields: 7-bit
// "year", 4-bit "month" and 5-bit "day". It is meant to be used as the body of
// a d.FieldStruct call.
// NOTE(review): "year" is presumably an offset from 1980, as MS-DOS year values
// are relative to 1980 — the raw value is emitted without adjustment.
func fieldMSDOSDate(d *decode.D) {
d.FieldU7("year")
d.FieldU4("month")
d.FieldU5("day")
}

// zipDecodeFlags decodes the 16-bit general purpose flags shared by central
// directory records and local file headers. It returns the value of the
// data_descriptor flag, which tells whether a data descriptor follows the
// file data.
func zipDecodeFlags(d *decode.D) bool {
	// TODO: 16LE, should have some kind of native endian flag reader helper?
	d.FieldU1("unused0")
	d.FieldBool("strong_encryption")
	d.FieldBool("compressed_patched_data")
	d.FieldBool("enhanced_deflation")
	hasDataDescriptor := d.FieldBool("data_descriptor")
	d.FieldBool("compression0")
	d.FieldBool("compression1")
	d.FieldBool("encrypted")

	d.FieldU2("reserved0")
	d.FieldBool("mask_header_values")
	d.FieldBool("reserved1")
	d.FieldBool("language_encoding")
	d.FieldU3("unused1")

	return hasDataDescriptor
}

// zipDecodeExtraFields decodes the "extra_fields" array shared by central
// directory records and local file headers. extraFieldLength is the total size
// in bytes of all extra fields; each one is a header ID, a data size and that
// many bytes of raw data.
func zipDecodeExtraFields(d *decode.D, extraFieldLength uint64) {
	d.FieldArray("extra_fields", func(d *decode.D) {
		d.LenFn(int64(extraFieldLength)*8, func(d *decode.D) {
			for !d.End() {
				d.FieldStruct("extra_field", func(d *decode.D) {
					d.FieldU16("header_id", d.MapUToScalar(headerIDMap), d.Hex)
					dataSize := d.FieldU16("data_size")
					d.FieldRawLen("data", int64(dataSize)*8)
				})
			}
		})
	})
}

// zipDecode decodes a ZIP archive.
//
// It first checks that the input starts with a local file signature, then
// locates the end of central directory record by searching backwards from the
// end of the input. From that record it gets the position and size of the
// central directory, decodes every central directory record and collects the
// local file header offsets of the files that start on the same disk as the
// end of central directory record. Finally each collected local file is
// decoded: stored files are added as raw data, deflated files are decompressed
// and probed with probeFormat.
//
// The in argument is not used and the function always returns nil.
func zipDecode(d *decode.D, in interface{}) interface{} {
	// TODO: just decode instead?
	if !bytes.Equal(d.PeekBytes(4), localFileSignature) {
		d.Errorf("expected PK header")
	}

	d.Endian = decode.LittleEndian

	// the end of central directory record is searched for starting at the end
	// of the input
	d.SeekAbs(d.Len())

	// TODO: better EOCD probe
	// NOTE(review): arguments are presumably value size in bits, step in bits and
	// max search distance in bits — confirm against decode.D.TryPeekFind
	p, _, err := d.TryPeekFind(32, -8, -10000, func(v uint64) bool {
		return v == uint64(endOfCentralDirectorySignatureN)
	})
	if err != nil {
		d.Fatalf("can't find end of central directory")
	}
	// p is relative to the current position, which is the end of the input
	d.SeekAbs(d.Len() + p)

	var offsetCD uint64
	var sizeCD uint64
	var diskNr uint64

	d.FieldStruct("end_of_central_directory", func(d *decode.D) {
		d.FieldRawLen("signature", 4*8, d.ValidateBitBuf(endOfCentralDirectorySignature))
		diskNr = d.FieldU16("disk_nr")
		d.FieldU16("central_directory_start_disk_nr")
		d.FieldU16("nr_of_central_directory_records_on_disk")
		d.FieldU16("nr_of_central_directory_records")
		sizeCD = d.FieldU32("size_of_central_directory")
		offsetCD = d.FieldU32("offset_of_start_of_central_directory")
		commentLength := d.FieldU16("comment_length")
		d.FieldUTF8("comment", int(commentLength))
	})

	// byte offsets of the local file headers to decode after the central directory
	var localFileOffsets []uint64

	d.SeekAbs(int64(offsetCD) * 8)
	d.FieldArray("central_directories", func(d *decode.D) {
		d.LenFn(int64(sizeCD)*8, func(d *decode.D) {
			for !d.End() {
				d.FieldStruct("central_directory", func(d *decode.D) {
					d.FieldRawLen("signature", 4*8, d.ValidateBitBuf(centralDirectorySignature))
					d.FieldU16("version_made_by")
					d.FieldU16("version_needed")
					d.FieldStruct("flags", func(d *decode.D) { zipDecodeFlags(d) })
					d.FieldU16("compression_method", d.MapUToStrSym(compressionMethodMap))
					// the time field comes before the date field in the record
					d.FieldStruct("last_modification_time", fieldMSDOSTime)
					d.FieldStruct("last_modification_date", fieldMSDOSDate)
					d.FieldU32("crc32_uncompressed", d.Hex)
					d.FieldU32("compressed_size")
					d.FieldU32("uncompressed_size")
					fileNameLength := d.FieldU16("file_name_length")
					extraFieldLength := d.FieldU16("extra_field_length")
					fileCommentLength := d.FieldU16("file_comment_length")
					diskNrStart := d.FieldU16("disk_number_where_file_starts")
					d.FieldU16("internal_file_attributes")
					d.FieldU32("external_file_attributes")
					localFileOffset := d.FieldU32("relative_offset_of_local_file_header")
					d.FieldUTF8("file_name", int(fileNameLength))
					zipDecodeExtraFields(d, extraFieldLength)
					d.FieldUTF8("file_comment", int(fileCommentLength))

					// only files starting on the disk that holds the end of central
					// directory record are part of this input
					if diskNrStart == diskNr {
						localFileOffsets = append(localFileOffsets, localFileOffset)
					}
				})
			}
		})
	})

	d.FieldArray("local_files", func(d *decode.D) {
		for _, o := range localFileOffsets {
			d.SeekAbs(int64(o) * 8)
			d.FieldStruct("local_file", func(d *decode.D) {
				var hasDataDescriptor bool
				d.FieldRawLen("signature", 4*8, d.ValidateBitBuf(localFileSignature))
				d.FieldU16("version_needed")
				d.FieldStruct("flags", func(d *decode.D) {
					hasDataDescriptor = zipDecodeFlags(d)
				})
				compressionMethod := d.FieldU16("compression_method", d.MapUToStrSym(compressionMethodMap))
				// the time field comes before the date field in the header
				d.FieldStruct("last_modification_time", fieldMSDOSTime)
				d.FieldStruct("last_modification_date", fieldMSDOSDate)
				d.FieldU32("crc32_uncompressed", d.Hex)
				compressedSize := d.FieldU32("compressed_size")
				d.FieldU32("uncompressed_size")
				fileNameLength := d.FieldU16("file_name_length")
				extraFieldLength := d.FieldU16("extra_field_length")
				d.FieldUTF8("file_name", int(fileNameLength))
				zipDecodeExtraFields(d, extraFieldLength)

				// a zero compressed size means the size is not known from the header,
				// so allow reading until the end of the input
				compressedLimit := int64(compressedSize) * 8
				if compressedLimit == 0 {
					compressedLimit = d.BitsLeft()
				}

				compressedStart := d.Pos()

				d.LenFn(compressedLimit, func(d *decode.D) {
					if compressionMethod == compressionMethodNone {
						d.FieldRawLen("uncompressed", int64(compressedSize)*8)
						return
					}

					// stays nil for compression methods that can't be decompressed
					var decompressR io.Reader
					compressedBB := d.BitBufRange(d.Pos(), d.BitsLeft())
					switch compressionMethod {
					case compressionMethodDeflated:
						// *bitio.Buffer implements io.ByteReader so that deflate doesn't do
						// its own buffering, which could read more than needed and make it
						// impossible to know the compressed size
						decompressR = flate.NewReader(compressedBB)
					}

					if decompressR != nil {
						uncompressed := &bytes.Buffer{}
						if _, err := d.Copy(uncompressed, decompressR); err != nil {
							d.IOPanic(err)
						}
						uncompressedBB := bitio.NewBufferFromBytes(uncompressed.Bytes(), -1)
						// best effort: try to probe the content, fall back to a plain
						// buffer field if no format decoded it
						dv, _, _ := d.FieldTryFormatBitBuf("uncompressed", uncompressedBB, probeFormat, nil)
						if dv == nil {
							d.FieldRootBitBuf("uncompressed", uncompressedBB)
						}

						// no compressed size, is a streaming zip, figure out size by checking what
						// position compressed buffer ended at
						if compressedSize == 0 {
							pos, err := compressedBB.Pos()
							if err != nil {
								d.IOPanic(err)
							}
							compressedSize = uint64(pos) / 8
						}
					}

					if compressedSize != 0 {
						d.FieldRawLen("compressed", int64(compressedSize)*8)
					}
				})

				// continue right after the file data, whatever was read above
				d.SeekAbs(compressedStart + int64(compressedSize)*8)

				if hasDataDescriptor {
					d.FieldStruct("data_indicator", func(d *decode.D) {
						// the signature is optional
						if bytes.Equal(d.PeekBytes(4), dataIndicatorSignature) {
							d.FieldRawLen("signature", 4*8, d.ValidateBitBuf(dataIndicatorSignature))
						}
						d.FieldU32("crc32_uncompressed", d.Hex)
						d.FieldU32("compressed_size")
						d.FieldU32("uncompressed_size")
					})
				}
			})
		}
	})

	return nil
}

0 comments on commit d838d2f

Please sign in to comment.