Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix example digests, additional checks, compute_headers_buffer() cleanup #67

Merged
merged 1 commit into from
Mar 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified test/data/example.warc
Binary file not shown.
Binary file modified test/data/example.warc.gz
Binary file not shown.
70 changes: 70 additions & 0 deletions test/test_check_digest_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from warcio.cli import main
from warcio import ArchiveIterator
from warcio.warcwriter import BufferWARCWriter

from . import get_test_file
import os

# Example archives excluded from the per-file parametrized digest check.
SKIP = [
    'example-trunc.warc',
    'example-iana.org-chunked.warc',
    'example-wrong-chunks.warc.gz',
    'example-bad-non-chunked.warc.gz',
    'example-digest.warc',
]


def pytest_generate_tests(metafunc):
    """Parametrize ``test_filename`` with every example archive not in SKIP.

    Tests that request a ``test_filename`` fixture get one case per file in
    the directory returned by ``get_test_file('.')`` whose name ends in
    ``.warc``, ``.warc.gz``, ``.arc`` or ``.arc.gz`` and is not listed in
    ``SKIP``. Tests without that fixture are left untouched.

    :param metafunc: pytest ``Metafunc`` object for the test being collected
    """
    if 'test_filename' not in metafunc.fixturenames:
        return

    # os.listdir() returns entries in arbitrary order; sort so the generated
    # test cases are collected in the same order on every run.
    files = sorted(filename for filename in os.listdir(get_test_file('.'))
                   if filename not in SKIP
                   and filename.endswith(('.warc', '.warc.gz', '.arc', '.arc.gz')))

    metafunc.parametrize('test_filename', files)


class TestExamplesDigest(object):
    """Run the ``check`` CLI command against the example archives."""

    def check_helper(self, args, expected_exit_value, capsys):
        """Run ``main(args)`` and return the text it wrote to stdout.

        :param args: command-line arguments passed to ``main``
        :param expected_exit_value: expected ``SystemExit`` code, or ``None``
            if ``main`` is expected to return without raising ``SystemExit``
        :param capsys: pytest ``capsys`` fixture used to read captured output
        :return: captured stdout
        """
        exit_value = None
        try:
            main(args=args)
        except SystemExit as e:
            exit_value = e.code

        # Checked after the try block rather than in a ``finally`` clause, so
        # an unexpected exception from main() propagates as itself instead of
        # being replaced by an AssertionError about the exit code.
        assert exit_value == expected_exit_value

        return capsys.readouterr().out

    def test_check_invalid(self, capsys):
        """``check`` exits 1 and reports the failed payload digest."""
        filenames = [get_test_file('example-digest.warc')]

        # Default output: only the failing record is reported.
        args = ['check'] + filenames
        value = self.check_helper(args, 1, capsys)
        assert value.count('payload digest failed') == 1
        assert value.count('WARC-Record-ID') == 1

        # Verbose output: passing digests are listed as well.
        args = ['check', '-v'] + filenames
        value = self.check_helper(args, 1, capsys)
        assert value.count('payload digest failed') == 1
        assert value.count('digest pass') == 3
        assert value.count('WARC-Record-ID') == 4

    def test_check_valid(self, capsys):
        """``check`` exits 0 and prints nothing for valid files unless ``-v``."""
        filenames = [get_test_file('example.warc'), get_test_file('example.warc.gz')]

        args = ['check'] + filenames
        expected = ''
        assert self.check_helper(args, 0, capsys) == expected

        args = ['check', '-v'] + filenames
        value = self.check_helper(args, 0, capsys)
        # two digests per file (payload and block)
        assert value.count('digest pass') == 4
        assert value.count('WARC-Record-ID') == 12

    def test_check_no_invalid_files(self, test_filename, capsys):
        """Each parametrized example file passes ``check -v`` with exit code 0."""
        args = ['check', '-v', get_test_file(test_filename)]
        value = self.check_helper(args, 0, capsys)
        assert value.count('digest failed') == 0

        # if ARC file, no digests to check, so no passing results
        if test_filename.endswith(('.arc', '.arc.gz')):
            assert value.count('digest pass') == 0
52 changes: 6 additions & 46 deletions test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def test_index():
{"length": "353", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "431", "offset": "353", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "1228", "offset": "784", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "526", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "585", "offset": "2538", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "526", "offset": "3123", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "609", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "586", "offset": "2621", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
{"length": "609", "offset": "3207", "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"length": "484", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "705", "offset": "488", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"length": "1365", "offset": "1197", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"}
Expand Down Expand Up @@ -54,55 +54,15 @@ def test_index_2():
{"offset": "0", "length": "353", "warc-type": "warcinfo", "filename": "example.warc.gz"}
{"offset": "353", "length": "431", "warc-type": "warcinfo", "filename": "example.warc.gz"}
{"offset": "784", "length": "1228", "http:status": "200", "warc-type": "response", "filename": "example.warc.gz"}
{"offset": "2012", "length": "526", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2538", "length": "585", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"}
{"offset": "3123", "length": "526", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2012", "length": "609", "warc-type": "request", "filename": "example.warc.gz"}
{"offset": "2621", "length": "586", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"}
{"offset": "3207", "length": "609", "warc-type": "request", "filename": "example.warc.gz"}
"""
with patch_stdout() as buff:
res = main(args=args)
assert buff.getvalue() == expected


def check_helper(args, expected_exit_value):
    """Run ``main(args)`` with stdout patched and return the captured output.

    :param args: command-line arguments passed to ``main``
    :param expected_exit_value: expected ``SystemExit`` code, or ``None``
        if ``main`` is expected to return without raising ``SystemExit``
    :return: value of the patched stdout buffer
    """
    with patch_stdout() as buff:
        exit_value = None
        try:
            main(args=args)
        except SystemExit as e:
            exit_value = e.code

        # Checked after the try block rather than in a ``finally`` clause, so
        # an unexpected exception from main() propagates as itself instead of
        # being replaced by an AssertionError about the exit code.
        assert exit_value == expected_exit_value

        return buff.getvalue()


def test_check_valid():
    """``check`` exits 0 on the valid examples and is silent unless ``-v``."""
    valid_files = [get_test_file(name)
                   for name in ('example.warc', 'example.warc.gz')]

    # Without -v nothing is written for valid archives.
    quiet_output = check_helper(['check'] + valid_files, 0)
    assert quiet_output == b''

    # With -v the passing digests and record ids are listed.
    verbose_output = check_helper(['check', '-v'] + valid_files, 0)
    assert verbose_output.count(b'digest pass') == 2
    assert verbose_output.count(b'WARC-Record-ID') == 12


def test_check_invalid():
    """``check`` exits 1 on the bad-digest example and reports the failure."""
    bad_files = [get_test_file('example-digest.warc')]

    # Without -v only the failing record is reported.
    quiet_output = check_helper(['check'] + bad_files, 1)
    assert quiet_output.count(b'payload digest failed') == 1
    assert quiet_output.count(b'WARC-Record-ID') == 1

    # With -v the passing digests are listed alongside the failure.
    verbose_output = check_helper(['check', '-v'] + bad_files, 1)
    assert verbose_output.count(b'payload digest failed') == 1
    assert verbose_output.count(b'digest pass') == 3
    assert verbose_output.count(b'WARC-Record-ID') == 4


def test_recompress_non_chunked():
with named_temp() as temp:
Expand Down
7 changes: 7 additions & 0 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ def add_range(self, start, part_len, total_len):
self.replace_header('Accept-Ranges', 'bytes')
return self

def compute_headers_buffer(self, header_filter=None):
    """
    Serialize the headers and store the result on ``self.headers_buff``.

    :param header_filter: passed through to ``to_ascii_bytes`` (default None)
    """
    # to_ascii_bytes produces the HTTP headers %-encoded as ascii
    # (see to_ascii_bytes for more info)
    encoded_headers = self.to_ascii_bytes(header_filter)
    self.headers_buff = encoded_headers

def __repr__(self):
    """Return a debug string showing the protocol, status line and headers."""
    # The backslash continues the string literal onto the next line, so any
    # leading whitespace on that line is part of the returned text.
    return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
headers = {2})".format(self.protocol, self.statusline, self.headers)
Expand Down
12 changes: 5 additions & 7 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ def ensure_digest(self, record, block=True, payload=True):
pos = 0
temp_file = self._create_temp_file()

if block_digester and record.http_headers and record.http_headers.headers_buff:
if block_digester and record.http_headers:
if not record.http_headers.headers_buff:
record.http_headers.compute_headers_buffer(self.header_filter)

block_digester.update(record.http_headers.headers_buff)

for buf in self._iter_stream(record.raw_stream):
Expand Down Expand Up @@ -235,17 +238,12 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):

return warc_headers

def _set_header_buff(self, record):
    """Serialize ``record.http_headers`` and store the bytes on its ``headers_buff``.

    The writer's ``header_filter`` is passed to ``to_ascii_bytes``.
    """
    http_headers = record.http_headers
    # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info)
    http_headers.headers_buff = http_headers.to_ascii_bytes(self.header_filter)

def _write_warc_record(self, out, record):
if self.gzip:
out = GzippingWrapper(out)

if record.http_headers:
self._set_header_buff(record)
record.http_headers.compute_headers_buffer(self.header_filter)

# Content-Length is None/unknown
# Fix record by: buffering and recomputing all digests and length
Expand Down