Skip to content

Commit

Permalink
digest checking improvements: (#67)
Browse files Browse the repository at this point in the history
- tests: fix incorrect digests for example.warc, example.warc.gz
- check every example file to ensure it contains no invalid digests (ARC files have no digests to check)
- move all digest checking to test_check_digest_examples.py

headers buffer cleanup: move headers buffer computation directly into StatusAndHeaders via compute_headers_buffer() instead of in warcwriter
  • Loading branch information
ikreymer committed Mar 30, 2019
1 parent 7f533c8 commit a9485eb
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 53 deletions.
Binary file modified test/data/example.warc
Binary file not shown.
Binary file modified test/data/example.warc.gz
Binary file not shown.
70 changes: 70 additions & 0 deletions test/test_check_digest_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from warcio.cli import main
from warcio import ArchiveIterator
from warcio.warcwriter import BufferWARCWriter

from . import get_test_file
import os

# Example files excluded from the per-file "no invalid digests" parametrization.
# 'example-digest.warc' is expected to fail its payload digest (it is exercised
# by test_check_invalid); the remaining entries are presumably truncated or
# badly chunked examples that cannot pass a clean check -- TODO confirm.
SKIP = [
    'example-trunc.warc',
    'example-iana.org-chunked.warc',
    'example-wrong-chunks.warc.gz',
    'example-bad-non-chunked.warc.gz',
    'example-digest.warc',
]


def pytest_generate_tests(metafunc):
    """Parametrize ``test_filename`` with the example archives in the test data dir.

    Only tests that request the ``test_filename`` fixture are affected. Every
    file in the directory returned by ``get_test_file('.')`` whose name ends in
    ``.warc``, ``.warc.gz``, ``.arc`` or ``.arc.gz`` and is not listed in
    ``SKIP`` becomes one parameter.
    """
    if 'test_filename' not in metafunc.fixturenames:
        return

    archive_suffixes = ('.warc', '.warc.gz', '.arc', '.arc.gz')

    # os.listdir() returns names in arbitrary order; sort them so the generated
    # test ids and their execution order are the same on every run and platform.
    files = sorted(
        filename for filename in os.listdir(get_test_file('.'))
        if filename not in SKIP and filename.endswith(archive_suffixes)
    )

    metafunc.parametrize('test_filename', files)


class TestExamplesDigest(object):
    """Digest checks of the example archives, driven through the ``check`` CLI command."""

    def check_helper(self, args, expected_exit_value, capsys):
        """Run the CLI ``main`` with ``args`` and return the captured stdout.

        :param args: argument list passed to ``main(args=...)``
        :param expected_exit_value: expected ``SystemExit`` code, or ``None``
            if ``main`` is expected to return without raising ``SystemExit``
        :param capsys: pytest ``capsys`` fixture used to read captured output
        :return: text written to stdout during the call
        """
        exit_value = None
        try:
            main(args=args)
        except SystemExit as e:
            exit_value = e.code

        # Asserted after the try block rather than in ``finally``: if main()
        # raises something other than SystemExit, that exception now propagates
        # on its own instead of being buried under an unrelated AssertionError.
        assert exit_value == expected_exit_value

        return capsys.readouterr().out

    def test_check_invalid(self, capsys):
        """A file with a bad payload digest exits with 1 and reports the failure."""
        filenames = [get_test_file('example-digest.warc')]

        args = ['check'] + filenames
        value = self.check_helper(args, 1, capsys)
        assert value.count('payload digest failed') == 1
        assert value.count('WARC-Record-ID') == 1

        args = ['check', '-v'] + filenames
        value = self.check_helper(args, 1, capsys)
        assert value.count('payload digest failed') == 1
        assert value.count('digest pass') == 3
        assert value.count('WARC-Record-ID') == 4

    def test_check_valid(self, capsys):
        """Valid example files exit with 0; only verbose mode prints anything."""
        filenames = [get_test_file('example.warc'), get_test_file('example.warc.gz')]

        args = ['check'] + filenames
        expected = ''
        assert self.check_helper(args, 0, capsys) == expected

        args = ['check', '-v'] + filenames
        value = self.check_helper(args, 0, capsys)
        # two digests per file (payload and block)
        assert value.count('digest pass') == 4
        assert value.count('WARC-Record-ID') == 12

    def test_check_no_invalid_files(self, test_filename, capsys):
        """Every non-skipped example file checks cleanly (parametrized per file)."""
        args = ['check', '-v', get_test_file(test_filename)]
        value = self.check_helper(args, 0, capsys)
        assert value.count('digest failed') == 0

        # if ARC file, no digests to check, so no passing results
        if test_filename.endswith(('.arc', '.arc.gz')):
            assert value.count('digest pass') == 0
52 changes: 6 additions & 46 deletions test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def test_index():
{"length": "353", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "431", "offset": "353", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "1228", "offset": "784", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "526", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "585", "offset": "2538", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "526", "offset": "3123", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "609", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "586", "offset": "2621", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "609", "offset": "3207", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "484", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "705", "offset": "488", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "1365", "offset": "1197", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
Expand Down Expand Up @@ -54,55 +54,15 @@ def test_index_2():
{"offset": "0", "length": "353", "warc-type": "warcinfo", "filename": "example.warc.gz"}
{"offset": "353", "length": "431", "warc-type": "warcinfo", "filename": "example.warc.gz"}
{"offset": "784", "length": "1228", "http:status": "200", "warc-type": "response", "filename": "example.warc.gz"}
{"offset": "2012", "length": "526", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2538", "length": "585", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"}
{"offset": "3123", "length": "526", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2012", "length": "609", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2621", "length": "586", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"}
{"offset": "3207", "length": "609", "warc-type": "request", "filename": "example.warc.gz"}
"""
with patch_stdout() as buff:
res = main(args=args)
assert buff.getvalue() == expected


# Run the CLI main() with ``args`` under patch_stdout(), assert the exit code,
# and return the contents of the patched buffer (presumably captured stdout as
# bytes -- patch_stdout is defined outside this view, confirm there).
def check_helper(args, expected_exit_value):
with patch_stdout() as buff:
# stays None when main() returns without raising SystemExit
exit_value = None
try:
main(args=args)
except SystemExit as e:
exit_value = e.code
finally:
# runs whether or not SystemExit was raised
assert exit_value == expected_exit_value

return buff.getvalue()


def test_check_valid():
    """Valid example files exit with 0; only verbose mode produces output."""
    valid_files = [get_test_file(name)
                   for name in ('example.warc', 'example.warc.gz')]

    # non-verbose run: nothing is printed
    quiet_output = check_helper(['check'] + valid_files, 0)
    assert quiet_output == b''

    # verbose run: count the reported passes and record ids
    verbose_output = check_helper(['check', '-v'] + valid_files, 0)
    assert verbose_output.count(b'digest pass') == 2
    assert verbose_output.count(b'WARC-Record-ID') == 12


def test_check_invalid():
    """A file with a bad payload digest exits with 1 and reports the failure."""
    bad_files = [get_test_file('example-digest.warc')]

    # non-verbose run: only the failing record is reported
    quiet_output = check_helper(['check'] + bad_files, 1)
    assert quiet_output.count(b'payload digest failed') == 1
    assert quiet_output.count(b'WARC-Record-ID') == 1

    # verbose run: passing digests are listed as well
    verbose_output = check_helper(['check', '-v'] + bad_files, 1)
    assert verbose_output.count(b'payload digest failed') == 1
    assert verbose_output.count(b'digest pass') == 3
    assert verbose_output.count(b'WARC-Record-ID') == 4


def test_recompress_non_chunked():
with named_temp() as temp:
Expand Down
7 changes: 7 additions & 0 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ def add_range(self, start, part_len, total_len):
self.replace_header('Accept-Ranges', 'bytes')
return self

def compute_headers_buffer(self, header_filter=None):
    """
    Encode the headers with to_ascii_bytes() and store the result
    on ``self.headers_buff``.

    :param header_filter: passed through unchanged to to_ascii_bytes()
    """
    # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info)
    encoded_headers = self.to_ascii_bytes(header_filter)
    self.headers_buff = encoded_headers

def __repr__(self):
# Debug representation built from protocol, statusline and headers.
# The trailing backslash continues the string literal onto the next line,
# so any leading whitespace on that line is part of the resulting text.
return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, self.headers)
Expand Down
12 changes: 5 additions & 7 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ def ensure_digest(self, record, block=True, payload=True):
pos = 0
temp_file = self._create_temp_file()

if block_digester and record.http_headers and record.http_headers.headers_buff:
if block_digester and record.http_headers:
if not record.http_headers.headers_buff:
record.http_headers.compute_headers_buffer(self.header_filter)

block_digester.update(record.http_headers.headers_buff)

for buf in self._iter_stream(record.raw_stream):
Expand Down Expand Up @@ -239,17 +242,12 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):

return warc_headers

def _set_header_buff(self, record):
    """
    Store the ascii-encoded HTTP headers of ``record`` on
    ``record.http_headers.headers_buff``, applying this writer's
    ``header_filter``.
    """
    http_headers = record.http_headers
    # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info)
    http_headers.headers_buff = http_headers.to_ascii_bytes(self.header_filter)

def _write_warc_record(self, out, record):
if self.gzip:
out = GzippingWrapper(out)

if record.http_headers:
self._set_header_buff(record)
record.http_headers.compute_headers_buffer(self.header_filter)

# Content-Length is None/unknown
# Fix record by: buffering and recomputing all digests and length
Expand Down

0 comments on commit a9485eb

Please sign in to comment.