diff --git a/test/data/example.warc b/test/data/example.warc index 4bc3089a..3d0f5ed8 100644 Binary files a/test/data/example.warc and b/test/data/example.warc differ diff --git a/test/data/example.warc.gz b/test/data/example.warc.gz index 235040f4..ed255396 100644 Binary files a/test/data/example.warc.gz and b/test/data/example.warc.gz differ diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py new file mode 100644 index 00000000..a559e895 --- /dev/null +++ b/test/test_check_digest_examples.py @@ -0,0 +1,70 @@ +from warcio.cli import main +from warcio import ArchiveIterator +from warcio.warcwriter import BufferWARCWriter + +from . import get_test_file +import os + +SKIP = ['example-trunc.warc', + 'example-iana.org-chunked.warc', + 'example-wrong-chunks.warc.gz', + 'example-bad-non-chunked.warc.gz', + 'example-digest.warc' + ] + + +def pytest_generate_tests(metafunc): + if 'test_filename' in metafunc.fixturenames: + files = [filename for filename in os.listdir(get_test_file('.')) + if filename not in SKIP and filename.endswith(('.warc', '.warc.gz', '.arc', '.arc.gz'))] + + metafunc.parametrize('test_filename', files) + + +class TestExamplesDigest(object): + def check_helper(self, args, expected_exit_value, capsys): + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return capsys.readouterr().out + + def test_check_invalid(self, capsys): + filenames = [get_test_file('example-digest.warc')] + + args = ['check'] + filenames + value = self.check_helper(args, 1, capsys) + assert value.count('payload digest failed') == 1 + assert value.count('WARC-Record-ID') == 1 + + args = ['check', '-v'] + filenames + value = self.check_helper(args, 1, capsys) + assert value.count('payload digest failed') == 1 + assert value.count('digest pass') == 3 + assert value.count('WARC-Record-ID') == 4 + + def test_check_valid(self, capsys): + filenames = [get_test_file('example.warc'), get_test_file('example.warc.gz')] + + args = ['check'] + filenames + expected = '' + assert self.check_helper(args, 0, capsys) == expected + + args = ['check', '-v'] + filenames + value = self.check_helper(args, 0, capsys) + # two digests per file (payload and block) + assert value.count('digest pass') == 4 + assert value.count('WARC-Record-ID') == 12 + + def test_check_no_invalid_files(self, test_filename, capsys): + args = ['check', '-v', get_test_file(test_filename)] + value = self.check_helper(args, 0, capsys) + assert value.count('digest failed') == 0 + + # if ARC file, no digests to check, so no passing results + if test_filename.endswith(('.arc', '.arc.gz')): + assert value.count('digest pass') == 0 diff --git a/test/test_cli.py b/test/test_cli.py index dc643ec4..a9bf18ed 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -24,9 +24,9 @@ def test_index(): {"length": "353", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "431", "offset": "353", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "1228", "offset": "784", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} -{"length": "526", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"} -{"length": "585", "offset": "2538", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} -{"length": "526", "offset": "3123", "warc-type": "request", "warc-target-uri": "http://example.com/"} +{"length": "609", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"} +{"length": "586", "offset": "2621", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} +{"length": "609", "offset": "3207", "warc-type": "request", "warc-target-uri": "http://example.com/"} {"length": "484", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "705", "offset": "488", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "1365", "offset": "1197", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} @@ -54,55 +54,15 @@ def test_index_2(): {"offset": "0", "length": "353", "warc-type": "warcinfo", "filename": "example.warc.gz"} {"offset": "353", "length": "431", "warc-type": "warcinfo", "filename": "example.warc.gz"} {"offset": "784", "length": "1228", "http:status": "200", "warc-type": "response", "filename": "example.warc.gz"} -{"offset": "2012", "length": "526", "warc-type": "request", "filename": "example.warc.gz"} -{"offset": "2538", "length": "585", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"} -{"offset": "3123", "length": "526", "warc-type": "request", "filename": "example.warc.gz"} +{"offset": "2012", "length": "609", "warc-type": "request", "filename": "example.warc.gz"} +{"offset": "2621", "length": "586", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"} +{"offset": "3207", "length": "609", "warc-type": "request", "filename": "example.warc.gz"} """ with patch_stdout() as buff: res = main(args=args) assert buff.getvalue() == expected -def check_helper(args, expected_exit_value): - with patch_stdout() as buff: - exit_value = None - try: - main(args=args) - except SystemExit as e: - exit_value = e.code - finally: - assert exit_value == expected_exit_value - - return buff.getvalue() - - -def test_check_valid(): - filenames = [get_test_file('example.warc'), get_test_file('example.warc.gz')] - - args = ['check'] + filenames - expected = b'' - assert check_helper(args, 0) == expected - - args = ['check', '-v'] + filenames - value = check_helper(args, 0) - assert value.count(b'digest pass') == 2 - assert value.count(b'WARC-Record-ID') == 12 - - -def test_check_invalid(): - filenames = [get_test_file('example-digest.warc')] - - args = ['check'] + filenames - value = check_helper(args, 1) - assert value.count(b'payload digest failed') == 1 - assert value.count(b'WARC-Record-ID') == 1 - - args = ['check', '-v'] + filenames - value = check_helper(args, 1) - assert value.count(b'payload digest failed') == 1 - assert value.count(b'digest pass') == 3 - assert value.count(b'WARC-Record-ID') == 4 - def test_recompress_non_chunked(): with named_temp() as temp: diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py index e0241361..7817e325 100644 --- a/warcio/statusandheaders.py +++ b/warcio/statusandheaders.py @@ -110,6 +110,13 @@ def add_range(self, start, part_len, total_len): self.replace_header('Accept-Ranges', 'bytes') return self + def compute_headers_buffer(self, header_filter=None): + """ + Set buffer representing headers + """ + # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info) + self.headers_buff = self.to_ascii_bytes(header_filter) + def __repr__(self): return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ headers = {2})".format(self.protocol, self.statusline, self.headers) diff --git a/warcio/warcwriter.py b/warcio/warcwriter.py index 3dbdc38c..260b2372 100644 --- a/warcio/warcwriter.py +++ b/warcio/warcwriter.py @@ -93,7 +93,10 @@ def ensure_digest(self, record, block=True, payload=True): pos = 0 temp_file = self._create_temp_file() - if block_digester and record.http_headers and record.http_headers.headers_buff: + if block_digester and record.http_headers: + if not record.http_headers.headers_buff: + record.http_headers.compute_headers_buffer(self.header_filter) + block_digester.update(record.http_headers.headers_buff) for buf in self._iter_stream(record.raw_stream): @@ -235,17 +238,12 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict): return warc_headers - def _set_header_buff(self, record): - # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info) - headers_buff = record.http_headers.to_ascii_bytes(self.header_filter) - record.http_headers.headers_buff = headers_buff - def _write_warc_record(self, out, record): if self.gzip: out = GzippingWrapper(out) if record.http_headers: - self._set_header_buff(record) + record.http_headers.compute_headers_buffer(self.header_filter) # Content-Length is None/unknown # Fix record by: buffering and recomputing all digests and length